Example #1
File: manager.py Project: ildap/parser
 def __init__(self):
     self.db = dbconnecter()
     if self.db.connect(self.data['postgres']):
         self.rutracker = Jumper(url='http://rutracker.org/forum/viewforum.php', proxies=None)
         self.kinopoisk = Jumper('http://www.kinopoisk.ru/index.php', None)
         self.reCompile()
         self.posterurl = self.data['poster']
         self.staticpath = self.data["static"]
         # self.rutracker.cookies = self.rutracker.post(url='http://login.rutracker.org/forum/login.php',logindata=self.data['login'])
         self.rutracker.setcookiejar_from_dict(self.data['cookies'])
         self.start()
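On its own, this fragment mainly shows how the project-local Jumper wrapper is constructed and seeded with cookies. A minimal, hypothetical sketch of the same pattern, using only the calls that actually appear in these examples (the cookie name is invented; the forum id is the one used in Manager.start below):

from spider import Jumper

tracker = Jumper(url='http://rutracker.org/forum/viewforum.php', proxies=None)
tracker.setcookiejar_from_dict({'bb_session': 'example-session-id'})  # invented cookie name/value
content = tracker.jump({'f': 2093, 'start': 0})  # GET parameters, as in Manager.start below
if content is not None:
    print 'fetched', len(content), 'bytes'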
Example #2
File: main.py Project: ildap/gks
import datetime

from scrapy.selector import Selector
import matplotlib.pyplot as plt
import aprox
from spider import Jumper

gks_ru = Jumper(url='http://www.gks.ru/dbscripts/cbsd/dbinet.cgi', proxies=None)
data = 'rdLayoutType=Au&_Pokazateli=on&_okato=on&_grtov=on&_god=on&_period=on&a_Pokazateli=1&a_okato=2&a_grtov=3&Qry=Pokazateli%3A1921002%3Bokato%3A80000000%3Bgrtov%3A1501%3Bgod%3A2011%2C2012%2C2013%2C2014%2C2015%3Bperiod%3A12011%2C22011%2C32011%2C42011%2C52011%2C62011%2C72011%2C82011%2C92011%2C102011%2C112011%2C122011%2C132011%2C142011%2C152011%2C162011%2C172011%2C182011%2C192011%2C202011%2C212011%2C222011%2C232011%2C242011%2C252011%2C262011%2C272011%2C282011%2C292011%2C302011%2C312011%2C322011%2C332011%2C342011%2C352011%2C362011%2C372011%2C382011%2C392011%2C402011%2C412011%2C422011%2C432011%2C442011%2C452011%2C462011%2C472011%2C482011%2C492011%2C502011%2C512011%2C12012%2C22012%2C32012%2C42012%2C52012%2C62012%2C72012%2C82012%2C92012%2C102012%2C112012%2C122012%2C132012%2C142012%2C152012%2C162012%2C172012%2C182012%2C192012%2C202012%2C212012%2C222012%2C232012%2C242012%2C252012%2C262012%2C272012%2C282012%2C292012%2C302012%2C312012%2C322012%2C332012%2C342012%2C352012%2C362012%2C372012%2C382012%2C392012%2C402012%2C412012%2C422012%2C432012%2C442012%2C452012%2C462012%2C472012%2C482012%2C492012%2C502012%2C512012%2C12013%2C22013%2C32013%2C42013%2C52013%2C62013%2C72013%2C82013%2C92013%2C102013%2C112013%2C122013%2C132013%2C142013%2C152013%2C162013%2C172013%2C182013%2C192013%2C202013%2C212013%2C222013%2C232013%2C242013%2C252013%2C262013%2C272013%2C282013%2C292013%2C302013%2C312013%2C322013%2C332013%2C342013%2C352013%2C362013%2C372013%2C382013%2C392013%2C402013%2C412013%2C422013%2C432013%2C442013%2C452013%2C462013%2C472013%2C482013%2C492013%2C502013%2C512013%2C522013%2C12014%2C22014%2C32014%2C42014%2C52014%2C62014%2C72014%2C82014%2C92014%2C102014%2C112014%2C122014%2C132014%2C142014%2C152014%2C162014%2C172014%2C182014%2C192014%2C202014%2C212014%2C222014%2C232014%2C242014%2C252014%2C262014%2C272014%2C282014%2C292014%2C302014%2C312014%2C322014%2C332014%2C342014%2C352014%2C362014%2C372014%2C382014%2C392014%2C402014%2C412014%2C422014%2C432014%2C442014%2C452014%2C462014%2C472014%2C482014%2C492014%2C502014%2C512014%2C12015%2C22015%2C32015%2C42015%2C52015%2C62015%2C72015%2C82015%2C92015%2C102015%2C112015%2C122015%2C132015%2C142015%2C152015%2C162015%2C172015%2C182015%2C192015%2C202015%2C212015%2C222015%2C232015%2C242015%2C252015%2C262015%2C272015%2C282015%2C292015%2C302015%2C312015%2C322015%2C332015%2C342015%2C352015%2C362015%2C372015%2C382015%2C392015%2C402015%2C412015%2C422015%2C432015%2C442015%2C452015%2C462015%2C472015%2C482015%3B&QryGm=Pokazateli_z%3A1%3Bokato_z%3A2%3Bgrtov_z%3A3%3Bgod_s%3A1%3Bperiod_b%3A1%3B&QryFootNotes=%3B&YearsList=2011%3B2012%3B2013%3B2014%3B2015%3B&tbl=%CF%EE%EA%E0%E7%E0%F2%FC+%F2%E0%E1%EB%E8%F6%F3'
content = gks_ru.post(None, data, True)
#file = open('content.html')
#content = file.read()
content = content.decode('cp1251')
content = content.replace(u",", u'.')

xpath = '//table[@class="OutTbl"]/tr[1]/td/text()'
years = [int(td.extract()) for td in Selector(text=content).xpath(xpath)]
values = []
weeks = []
print years
xpath = '//table[@class="OutTbl"]/tr'
count = 0
param = 2
column = 0
for tr in Selector(text=content).xpath(xpath)[1:]:
    count += 1
    xpath = u'//td[{}]/text()'.format(param)
    tdn = Selector(text=tr.extract()).xpath(xpath).extract()[0]
    if tdn != u'\xa0':
        values.append(float(tdn))
    else:
        pass  # empty cell; the original example is truncated at this point
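The listing cuts the loop off here; matplotlib (and the project-local aprox module) are imported at the top but never reached in the visible part. As a hedged sketch only, not taken from the original file, the collected values could be plotted along these lines once the loop has filled the list:

import matplotlib.pyplot as plt

values = [101.3, 99.8, 100.4, 102.1]  # placeholder data; the real list is built by the loop above
plt.plot(range(1, len(values) + 1), values, marker='o')
plt.xlabel('period')
plt.ylabel('value')
plt.title('gks.ru series')
plt.show()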
Example #3
File: manager.py Project: ildap/parser
# Imports required by the snippet below; dbconnecter is a project-local module,
# so its exact import path is assumed, and Jumper is imported as in main.py above.
import ast
import os
import re
from datetime import datetime

from scrapy.selector import Selector

from dbconnecter import dbconnecter
from spider import Jumper


class Manager:
    path = os.path.dirname(os.path.abspath(__file__))
    data = ast.literal_eval(open(path + r'\data.txt').read())

    def __init__(self):
        self.db = dbconnecter()
        if self.db.connect(self.data['postgres']):
            self.rutracker = Jumper(url='http://rutracker.org/forum/viewforum.php', proxies=None)
            self.kinopoisk = Jumper('http://www.kinopoisk.ru/index.php', None)
            self.reCompile()
            self.posterurl = self.data['poster']
            self.staticpath = self.data["static"]
            # self.rutracker.cookies = self.rutracker.post(url='http://login.rutracker.org/forum/login.php',logindata=self.data['login'])
            self.rutracker.setcookiejar_from_dict(self.data['cookies'])
            self.start()
            # print self.db.get('*','ruparser_topic','1=1')

    def start(self):
        for i in range(0, 300, 50):

            args = {'f': 2093, 'start': i}
            print 'get pos', i,
            content = self.rutracker.jump(args)

            if content is not None:
                print 'ok'
                self.parseTopics(content)
                tm = datetime.now()
                tm = tm.strftime("%d.%m.%y_%H-%M")
                self._save(self.path + r'\html\topics{}_{}.html'.format(args['start'], str(tm)), content)

    def reCompile(self):
        self.resc = re.compile(r'GB', re.UNICODE)  # size unit marker ("GB")
        self.rescfl = re.compile(r'[0.0-9]+', re.UNICODE)  # runs of digits/decimal points, e.g. "7.8"
        self.rename = re.compile('[\S]+[^/^(]+', re.UNICODE)  # title text up to the first '/' or '('
        self.reyear = re.compile(r'(?<=\[)\d{4}(?!\d)', re.UNICODE)  # 4-digit year right after '['
        self.renum = re.compile(r'\d+', re.UNICODE)  # plain digit runs
        self.retime = re.compile(r'\d+:\d+')  # running time such as "2:49"
        self.resub = re.compile(r'<wbr>', re.UNICODE)  # <wbr> tags stripped from kinopoisk markup
        self.req = re.compile(r'\'', re.UNICODE)  # single quotes stripped from description text

    def getPoster(self, id):
        id = str(id)
        raw = self.kinopoisk.raw(self.posterurl + str(id) + '.jpg')
        dir = self.staticpath + id[0:2] + '/' + id[2:4] + '/' + id[4] + '/'
        try:
            os.stat(dir)
        except OSError:  # poster directory does not exist yet
            os.makedirs(dir)
        with open(dir + id[5] + '.jpg', 'wb') as file:
            file.writelines(raw)

    def dbfind(self, name, year):
        where = " name like '" + name.encode('utf8') + "%' and year='" + str(year) + "'"
        fid = self.db.getID('ruparser_film', where)
        if fid:
            print 'was found in dbase id=', fid
            return fid
        else:
            args = {u'level': u'7',
                    u'first': u'yes',
                    u'from': u'forma',
                    u'result': u'adv',
                    u'm_act[from]': u'forma',
                    u'm_act[what]': u'content',
                    u'm_act[find]': name.encode('utf8'),
                    u'm_act[year]': year.encode('utf8')}
            content = self.kinopoisk.jump(args)
            # file = open('force.html')
            # content = file.read()
            if content is not None:
                return self.kinopars(name, year, content)

    def kinopars(self, name, year, content):
        content = content.decode('cp1251')
        content = content.encode('utf8')
        content = self.resub.sub('', content)

        def check(obj):
            if obj:
                return obj[0]
            else:
                return ''

        xpath = '//link[@rel="canonical"]/@href'
        id = Selector(text=content).xpath(xpath).extract()
        if id:
            id = self.renum.findall(id[0])[0]
        xpath = '//div[@class="brand_words"][@itemprop="description"]/text()'
        text = check(Selector(text=content).xpath(xpath).extract())
        text = self.req.sub('', text)

        xpath = '//span[@class="rating_ball"]/text()'
        rating = check(Selector(text=content).xpath(xpath).extract())
        if text and rating and id:
            print 'was found on kinopoisk.ru/film', id
            xpath = '//a[@class="popupBigImage"]/img/@src'
            poster = check(Selector(text=content).xpath(xpath).extract())
            if poster == 'http://st.kp.yandex.net/images/movies/poster_none.png':
                poster = u'false'
            else:
                poster = u'true'

            xpath = '//span[@class="ratingCount"]/text()'
            count = Selector(text=content).xpath(xpath).extract()

            if count:
                count = count[0]
                count = count.replace(u'\xa0', u'')
            else:
                count = 0
            print 'rating', rating, count,
            xpath = '//td[@class="time"]/text()'
            time = Selector(text=content).xpath(xpath).extract()
            nulltime = '0:0'
            if len(time) > 1:
                time = self.retime.findall(time[1])
                if len(time) >= 1:
                    time = time[0]
                else:
                    time = nulltime

            elif len(time) == 1:
                time = self.renum.findall(time[0])
                if len(time) >= 1:
                    time = int(time[0])
                    th = time / 60
                    tm = time - (th * 60)
                    time = str(th) + ':' + str(tm)
                else:
                    time = nulltime
            else:
                time = nulltime

            print 'time', time,

            xpath = '//div[@id="block_rating"]/div[1]/div[2]/text()'
            imdb = check(Selector(text=content).xpath(xpath).extract())

            if imdb:
                imdb = float(self.rescfl.findall(imdb)[0])
            else:
                imdb = 0
            print 'imdb:', imdb
            head = '(name,year,text,rating,count,imdb,time,kinopoiskid,poster)'
            values = (name.encode('utf8'), year, text.encode('utf8'), rating, count, imdb, time, id, poster)
            fid = self.db.insert('ruparser_film', head, values)
            return fid

    def kinosearch(self, fullname):
        try:
            name = self.rename.findall(fullname)[0]
            year = self.reyear.findall(fullname)[0]
            print 'search', name, year,
            return self.dbfind(name, year)
        except IndexError as er:
            print 'kinosearch', er
            return None
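For reference, a small standalone illustration of what the rename and reyear expressions compiled in reCompile extract from a rutracker-style topic title. The regexes are copied verbatim from the class above; the sample title itself is invented:

import re

rename = re.compile('[\S]+[^/^(]+', re.UNICODE)  # copied from Manager.reCompile
reyear = re.compile(r'(?<=\[)\d{4}(?!\d)', re.UNICODE)

fullname = u'Interstellar (Christopher Nolan) [2014, BDRip 1080p]'  # invented sample title
print rename.findall(fullname)[0]  # -> u'Interstellar ' (text before the first '(' or '/')
print reyear.findall(fullname)[0]  # -> u'2014' (the year right after '[')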