Python ModelSQLite примеры, controlador_vista.ModelSQLite Python примеры использования

Пример #1

0

Показать файл

class manuelturizoadn(scrapy.Spider):
    """Spider for https://downloadmanuelturizoadn.wordpress.com.

    ``parse`` reads the title, date and download link from the landing page
    and follows that link; ``parse_attr`` prints every track title found on
    the linked page and stores it through the class-level controller when
    ``veri`` accepts the infringing URL.
    """

    name = 'manuelturizoadn'
    _num_pagina = 1
    start_urls = ['https://downloadmanuelturizoadn.wordpress.com']

    id_domin = 5
    nombre_trelacion = 'dominios'
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments (-a key=value) are not
        # silently dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        # NOTE(review): explicit comparison kept because the return type of
        # existe_id is not visible here.
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Extract the page data and follow the download link.

        Yields:
            A single ``scrapy.Request`` for the download link, carrying the
            scraped fields in ``meta``; nothing when the page has no link.
        """
        print('\n########Pagina ' + str(self._num_pagina) + '########')
        # Collect the data shown on the page.
        titulo = response.css('h1.entry-title ::text').get()
        referer = response.css('figure > a ::attr(href)').get()
        fecha = response.css('time.entry-date.published.updated ::text').get()
        cantante, album = separa_titulo(titulo, '–')
        album = separa(album, ' ', 1)

        # scrapy.Request rejects a None URL, so stop cleanly instead of
        # raising inside the callback.
        if referer is None:
            self.logger.warning('No download link found on %s', response.url)
            return

        # Follow the referer.
        yield scrapy.Request(referer,
                             callback=self.parse_attr,
                             meta={
                                 'fecha': fecha,
                                 'referer': referer,
                                 'titulo': titulo,
                                 'cantante': cantante,
                                 'album': album
                             })

    def parse_attr(self, response):
        """Print and store every track title listed on the download page."""
        inf = response.css('button#download-btn ::attr(onclick)').get()
        infringing = separa(inf, "'", 1)

        for tb in response.css('td.column-title ::text'):
            titulo = tb.get()
            print('\n*****************DATOS*****************')
            print('infringing: ' + infringing)
            print('referer: ' + response.meta['referer'])
            print('titulo: ' + titulo)
            print('fecha: ' + response.meta['fecha'])
            print('cantante: ' + response.meta['cantante'])
            print('album: ' + response.meta['album'])
            print('***************************************\n')

            # Insert into the database.
            if veri(infringing) == True:
                self.c.inserta_item(titulo, response.meta['cantante'],
                                    response.meta['album'],
                                    response.meta['referer'], infringing,
                                    response.meta['fecha'], self.id_domin)

Пример #2

0

Показать файл

 def __init__(self, name=None, **kwargs):
     """Create the relational table and register this domain if missing."""
     # Build a controller that lives only for the duration of this call.
     controller = Controler(ModelSQLite(), View())
     relation = self.nombre_trelacion
     controller.crea_tabla(relation)
     already_registered = controller.existe_id(self.id_domin, relation)
     if already_registered == False:
         # Store the domain together with today's date.
         today = date.today().strftime("%d %B, %Y")
         controller.inserta_item_relacional(
             self.id_domin, self.start_urls[0], today, relation)

Пример #3

0

Показать файл

Файл: alvarosolereterno.py Проект: Jessik167/Scrapy

class alvarosolereterno(scrapy.Spider):
    """Spider for https://alvarosolereternoagostoalbummp3.wordpress.com/.

    ``parse`` follows the download link found on the landing page and
    ``parse_attr`` reads artist, album, date and the infringing URL from the
    linked page, then stores one row per song through the controller.
    """

    name = 'alvarosolereterno'
    _num_pagina = 1
    start_urls = ['https://alvarosolereternoagostoalbummp3.wordpress.com/']

    id_domin = 12
    nombre_trelacion = 'dominios'
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments are not dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Follow the download link of the landing page, if there is one."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')
        referer = response.css('figure > a ::attr(href)').get()
        # scrapy.Request rejects a None URL, so stop cleanly instead of
        # raising inside the callback.
        if referer is None:
            self.logger.warning('No download link found on %s', response.url)
            return
        yield scrapy.Request(referer, callback=self.parse_attr)

    def parse_attr(self, response):
        """Read the album data from the download page and store each song."""
        referer = response.url
        artista = response.xpath(
            '/html/body/div/div[2]/div/div[2]/span[2]/text()').extract_first()
        album = response.xpath(
            '/html/body/div/div[2]/div/div[2]/span[4]/text()').extract_first()
        fecha = response.xpath(
            '/html/body/div/div[2]/div/div[2]/span[6]/text()').extract_first()
        infringing = response.css(
            'div#download-btn-div :nth-child(4) ::attr(onclick)').get()
        infringing = separa(infringing, '"', 1)

        for tr in response.css('tbody > tr :nth-child(1)'):
            cancion = tr.css('::text').get()
            # Print the collected information.
            imprime_datos(cancion, fecha, artista, album, referer, infringing)
            # Insert into the database only when the URL is not stored yet
            # and veri accepts it.
            if self.c.existe_inf(infringing, self.id_domin) == False:
                if veri(infringing) == True:
                    self.c.inserta_item(cancion, artista, album, referer,
                                        infringing, fecha, self.id_domin)

Пример #4

0

Показать файл

    def parse(self, response):
        """Request every article on the listing page, then follow pagination.

        Each article request carries one shared ``SpidersItem`` and one shared
        ``Controler`` in ``meta`` for ``parse_attr`` to use.
        """
        shared_item = SpidersItem()
        controller = Controler(ModelSQLite(), View())

        # Progress banner for the current page.
        print('\n########Pagina ' + str(self._num_pagina) + '########')

        # One request per article excerpt found on the page.
        for article in response.css('article.post.excerpt'):
            detail_url = article.css('a ::attr(href)').get()
            yield scrapy.Request(
                detail_url,
                callback=self.parse_attr,
                meta={'item': shared_item, 'controler': controller},
            )
        print('')

        self._num_pagina += 1

        # Follow every "next page" link with this same callback.
        yield from (response.follow(link, self.parse)
                    for link in response.css('a.next.page-numbers'))

Пример #5

0

Показать файл

from datetime import date
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from verifica_link import veri, separa_titulo, separa, strip_spaces, imprime_datos
from controlador_vista import ModelSQLite, View, Controler

# Identifiers and start URL for the softarchive scraper, kept as plain
# module-level names in this script.
name = 'softarchive'
_num_pagina = 1
start_urls = ['https://softarchive.unblocked.ltda/music/']

id_domin = 13
nombre_trelacion = 'dominios'
# Create the controller object.
c = Controler(ModelSQLite(), View())

# Create the relational table.
c.crea_tabla(nombre_trelacion)
if c.existe_id(id_domin, nombre_trelacion) == False:
    # Take today's date.
    hoy = date.today().strftime("%d %B, %Y")
    # Insert the domain into the relational table.
    c.inserta_item_relacional(id_domin, start_urls[0], hoy, nombre_trelacion)

# Progress banner for the current page.
print('\n########Pagina ' + str(_num_pagina) + '########')
# Open the Selenium browser (hard-coded local chromedriver path).
driver = webdriver.Chrome('C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
driver.get(start_urls[0])
# Wait for the page to load.

Пример #6

0

Показать файл

Файл: bjholver10.py Проект: Jessik167/Scrapy

class bjholver10(scrapy.Spider):
    """Spider for a single bjholver10.blogspot.com post.

    ``parse`` walks the post body collecting titles and their "DESCARGAR"
    links into parallel lists, and ``get_datos`` prints and stores them.
    """

    name = 'bjholver10'
    _num_pagina = 1
    start_urls = ['https://bjholver10.blogspot.com/2019/07/descarga-tu-musica-y-regueton-espero.html']

    id_domin = 8
    nombre_trelacion = 'dominios'
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments are not dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Collect titles and download links from the post body."""
        titulo = []
        cantante = []
        album = []
        referer = []
        infringing = []
        fecha = date.today().strftime("%B %d, %Y")
        print('\n########Pagina ' + str(self._num_pagina) + '########')
        for span in response.css('div#post-body-5669324029259817671 > div > div'):
            text = span.css('span ::text').get()
            if text is None:
                continue
            # A title is any text that is neither the download label nor a
            # string starting with '0'. startswith() is used instead of
            # text[0] so an empty string cannot raise IndexError.
            if text != 'DESCARGAR' and not text.startswith('0'):
                titulo.append(text)
                can, alb = separa_titulo(text, '-')
                if can == '-':
                    # Retry with an en dash as the separator.
                    can, alb = separa_titulo(text, '–')
                cantante.append(can)
                album.append(alb)
            text_des = span.css('span > a ::text').get()
            text_des1 = span.css('a ::text').get()
            if text_des == 'DESCARGAR' or text_des1 == 'DESCARGAR':
                href = span.css('span > a ::attr(href)').get()
                if href is None:
                    href = span.css('a ::attr(href)').get()
                # Only record a link that was actually found. The previous
                # check tested the list itself (never None), so a missing
                # href reused the last referer or raised IndexError.
                if href is not None:
                    referer.append(str(href))
                    infringing.append(self.get_inf(referer[-1]))

        self.get_datos(titulo, fecha, cantante, album, referer, infringing)

    def get_datos(self, titulo, fecha, cantante, album, referer, infringing):
        """Print every collected record and store the ones ``veri`` accepts.

        Args:
            titulo, cantante, album, referer, infringing: Parallel lists; the
                entries at the same position form one record.
            fecha: Date string stored with every record.
        """
        # zip() stops at the shortest list, so records are never read past
        # the end of a shorter list.
        for tit, can, alb, ref, inf in zip(titulo, cantante, album, referer,
                                           infringing):
            print('\n*****************DATOS*****************')
            print('infringing: ' + inf)
            print('referer: ' + ref)
            print('titulo: ' + tit)
            print('fecha: ' + fecha)
            print('cantante: ' + can)
            print('album: ' + alb)
            print('***************************************\n')
            # Insert into the database.
            if veri(inf) == True:
                if self.c.existe_inf(inf, self.id_domin) == False:
                    self.c.inserta_item(tit, can, alb, ref, inf, fecha,
                                        self.id_domin)

    def get_inf(self, url):
        """Fetch ``url`` with requests and return the URL of the request that
        produced the final response."""
        response = requests.get(url)
        return str(response.request.url)

Пример #7

0

Показать файл

Файл: produccionesortega.py Проект: Jessik167/Scrapy

class produccionesortega(scrapy.Spider):
    """Spider for http://produccionesortega507.com.

    For every article on a listing page it looks for the download link in
    several candidate locations, resolves the infringing URL and stores the
    record through the class-level controller, then follows pagination.
    """

    name = 'produccionesortega'
    _num_pagina = 1
    start_urls = ['http://produccionesortega507.com']

    id_domin = 6
    nombre_trelacion = 'dominios'
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments are not dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def _inserta_si_nuevo(self, titulo, cantante, album, referer, infringing,
                          fecha):
        """Store the record when ``veri`` accepts it and it is not stored yet."""
        if veri(infringing) == True:
            if self.c.existe_inf(infringing, self.id_domin) == False:
                self.c.inserta_item(titulo, cantante, album, referer,
                                    infringing, fecha, self.id_domin)

    def parse(self, response):
        """Process every article on the page, then follow the next page."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')
        for art in response.xpath('//*[@id="content"]/div[1]/article'):
            # band stays True while the record still has to be stored after
            # the link lookup; the inner loop stores its own records.
            band = True
            titulo = art.css(' h2 > a ::text').get()
            fecha = art.css('p > span > a ::text').get()
            album, cantante = separa_titulo(titulo, '-')
            referer = art.css('div > div.post-entry-content > a ::attr(href)').get()
            file_id = self.get_id(referer, '/', 3)
            infringing = self.get_infr(file_id)
            if referer is None:
                referer = art.css('div > div > p > strong > a ::attr(href)').get()
                if referer is None:
                    for p in art.css('div.post-entry-content > strong > span > span > a') or art.css('div.post-entry-content > p > a'):
                        band = False
                        referer = p.css('::attr(href)').get()
                        file_id = self.get_id(referer, '/', 3)
                        infringing = self.get_infr(file_id)
                        if self.comprueba_refer(referer, 'open') == True:
                            r = art.css('div.post-entry-content > p')
                            # A SelectorList is never None, so test whether
                            # it is empty; narrow to the <strong> variant
                            # only when one exists.
                            if r.css('strong'):
                                r = r.css('strong > span')
                            referer = r.css('a ::attr(href)').get()
                            file_id = self.get_id(referer, '/', 3)
                            infringing = self.get_infr(file_id)
                            self._inserta_si_nuevo(titulo, cantante, album,
                                                   referer, infringing, fecha)
                        # The former else branch repeated the id lookup and
                        # HTTP request made just above with the same referer.
                        self.imprime_datos(titulo, fecha, cantante, album,
                                           referer, infringing)
                else:
                    file_id = self.get_id(referer, '/', 3)
                    infringing = self.get_infr(file_id)
            else:
                if self.comprueba_refer(referer, '?') == True:
                    infringing = self.get_Mega(referer)

            if band == True:
                self._inserta_si_nuevo(titulo, cantante, album, referer,
                                       infringing, fecha)

            self.imprime_datos(titulo, fecha, cantante, album, referer,
                               infringing)

        self._num_pagina += 1
        try:
            next_page = response.css('a.next.page-numbers ::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
        # Exception (not a bare except) so GeneratorExit raised at the yield
        # and KeyboardInterrupt are not swallowed.
        except Exception:
            print('Hubo un problema al abrir la página siguiente')

    def comprueba_refer(self, ref, cad):
        """Return True when ``cad`` occurs in ``ref``; False for a None ref."""
        return ref is not None and cad in ref

    def get_Mega(self, ref):
        """Open ``ref`` in a browser, wait 8 seconds and return the URL the
        browser ended up on."""
        driver = abre_navegador()
        try:
            driver.get(ref)
            time.sleep(8)
            return driver.current_url
        finally:
            # Always release the browser.
            # NOTE(review): assumes abre_navegador returns a fresh Selenium
            # WebDriver on every call - confirm.
            driver.quit()

    def imprime_datos(self, titulo, fecha, cantante, album, referer, infringing):
        """Print one record; missing (None) fields are printed as 'None'."""
        print('\n*****************DATOS*****************')
        print('infringing: ' + str(infringing))
        print('referer: ' + str(referer))
        print('titulo: ' + str(titulo))
        print('fecha: ' + str(fecha))
        print('cantante: ' + str(cantante))
        print('album: ' + str(album))
        print('***************************************\n')

    def get_id(self, id, separador, pos):
        """Return piece ``pos`` of ``id`` split on ``separador``.

        Returns '' when ``id`` is None or has fewer than ``pos + 1`` pieces.
        """
        if id is None:
            return ''
        piezas = id.split(separador)
        try:
            return str(piezas[pos])
        except IndexError:
            return ''

    def get_infr(self, id):
        """Request the download endpoint for ``id`` and return the request URL.

        Returns "" when ``id`` is None, empty or '0'. The previous condition
        used ``or`` and was always true, so this branch was unreachable and a
        None id raised TypeError.
        """
        if not id or id == '0':
            return ""
        response = requests.get('https://musica.produccionesortega507.com/d.php?id=' + id, stream=True)
        try:
            return str(response.request.url)
        finally:
            # With stream=True the connection is held until the body is
            # consumed or the response is closed.
            response.close()

Пример #8

0

Показать файл

class lvumusic(scrapy.Spider):
    """Spider for https://www.lvumusic.net.

    ``parse`` requests every article on a listing page, ``parse_attr``
    follows the link inside the article and ``parse_attr2`` reads the
    infringing URL from the final page and stores the record.
    """

    name = 'lvumusic'
    _num_pagina = 1
    start_urls = ['https://www.lvumusic.net']

    id_domin = 7
    nombre_trelacion = 'dominios'
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments are not dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Request every article of the listing page, then the next page."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')
        for art in response.css('div.article-container > article'):
            href = art.css('div.featured-image > a ::attr(href)').get()
            # An article without a link is skipped; building a Request with
            # None would raise and abort the rest of the page, pagination
            # included.
            if href is None:
                continue
            titulo = art.css('div.featured-image > a ::attr(title)').get()
            cantante, album = separa_titulo(titulo, '–')
            fecha = art.css(
                'div.below-entry-meta > span > a > time ::text').get()
            yield scrapy.Request(href,
                                 callback=self.parse_attr,
                                 meta={
                                     'fecha': fecha,
                                     'titulo': titulo,
                                     'cantante': cantante,
                                     'album': album
                                 })
        self._num_pagina += 1
        try:
            next_page = response.css('li.previous > a ::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
        # Exception (not a bare except) so GeneratorExit raised at the yield
        # and KeyboardInterrupt are not swallowed.
        except Exception:
            print('Hubo un problema al abrir la página siguiente')

    def parse_attr(self, response):
        """Follow the link inside the article, forwarding the scraped fields."""
        href = response.css(
            'div.entry-content.clearfix > center > a ::attr(href)').get()
        # scrapy.Request rejects a None URL.
        if href is None:
            self.logger.warning('No download link found on %s', response.url)
            return
        yield scrapy.Request(href,
                             callback=self.parse_attr2,
                             meta={
                                 'fecha': response.meta['fecha'],
                                 'titulo': response.meta['titulo'],
                                 'cantante': response.meta['cantante'],
                                 'album': response.meta['album']
                             })

    def parse_attr2(self, response):
        """Read the infringing URL from the final page and store the record."""
        referer = response.url
        infringing = response.css(
            'div.post-body.entry-content > center > table > tbody > tr > td > center > a ::attr(href)'
        ).get()
        imprime_datos(response.meta['titulo'], response.meta['fecha'],
                      response.meta['cantante'], response.meta['album'],
                      referer, infringing)
        # Insert into the database only when veri accepts the URL and it is
        # not stored yet.
        if veri(infringing) == True:
            if self.c.existe_inf(infringing, self.id_domin) == False:
                self.c.inserta_item(response.meta['titulo'],
                                    response.meta['cantante'],
                                    response.meta['album'], referer,
                                    infringing, response.meta['fecha'],
                                    self.id_domin)

Пример #9

0

Показать файл

Файл: lamusicamp3.py Проект: Jessik167/Scrapy

class lamusicamp3(scrapy.Spider):
    """Spider for https://lamusicamp3.com/.

    ``parse`` requests every titled link of a listing page except a fixed set
    of ignored titles, and ``parse_attr`` stores the download link found on
    each requested page.
    """

    name = 'lamusicamp3'
    _num_pagina = 1
    start_urls = ['https://lamusicamp3.com/']

    id_domin = 9
    nombre_trelacion = 'dominios'
    # Link titles that parse() skips.
    _TITULOS_IGNORADOS = frozenset(
        ('Descargar', 'Ak47Full', 'iPauta', 'ElGenero', 'FlowHot'))
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments are not dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Request every relevant link of the page, then the next page."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')
        for a in response.xpath('/html/body/div[1]/a'):
            # The request is built inside the try and yielded outside it: a
            # bare except around the yield also caught GeneratorExit and let
            # the loop yield again after the generator was closed.
            try:
                titu = a.css('::attr(title)').get()
                if titu is None or titu in self._TITULOS_IGNORADOS:
                    continue
                ref = a.css('::attr(href)').get()
                hoy = date.today().strftime("%d %B, %Y")
                cantante, album = separa_titulo(titu, '–')
                request = scrapy.Request(ref,
                                         callback=self.parse_attr,
                                         meta={
                                             'referer': ref,
                                             'fecha': hoy,
                                             'titulo': titu,
                                             'cantante': cantante,
                                             'album': album
                                         })
            except Exception as exc:
                # Still best-effort (the link is skipped), but no longer
                # silent.
                self.logger.warning('Skipping link on %s: %s', response.url,
                                    exc)
                continue
            yield request
        self._num_pagina += 1
        try:
            next_page = response.css('a.nextpostslink ::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
        # Exception (not a bare except) so GeneratorExit raised at the yield
        # and KeyboardInterrupt are not swallowed.
        except Exception:
            print('Hubo un problema al abrir la página siguiente')

    def parse_attr(self, response):
        """Store the download link of the page when it has one."""
        if response.css('a.btn-dl'):
            infringing = response.css('a.btn-dl ::attr(href)').get()
            imprime_datos(response.meta['titulo'], response.meta['fecha'],
                          response.meta['cantante'], response.meta['album'],
                          response.meta['referer'], infringing)
            # Insert into the database only when veri accepts the URL and it
            # is not stored yet.
            if veri(infringing) == True:
                if self.c.existe_inf(infringing, self.id_domin) == False:
                    self.c.inserta_item(response.meta['titulo'],
                                        response.meta['cantante'],
                                        response.meta['album'],
                                        response.meta['referer'], infringing,
                                        response.meta['fecha'], self.id_domin)

Пример #10

0

Показать файл

class djmixtico(scrapy.Spider):
    """Spider for https://djmixtico.blogia.com/.

    ``parse`` requests every post linked from a listing page and follows
    pagination; ``parse_attr`` reads the title, date and download link of a
    post and stores the record.
    """

    name = 'djmixtico'
    _num_pagina = 1
    start_urls = ['https://djmixtico.blogia.com/']

    id_domin = 12
    nombre_trelacion = 'dominios'
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments are not dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Request every post of the listing page, then the next page."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')

        for a in response.css('h2 > a'):
            ref = a.css('::attr(href)').get()
            # An anchor without href is skipped; building a Request with None
            # would raise and abort the rest of the page.
            if ref is None:
                continue
            # Follow the referer.
            yield scrapy.Request(ref,
                                 callback=self.parse_attr,
                                 meta={'referer': ref})

        self._num_pagina += 1
        try:
            next_page = response.xpath(
                '/html/body/main/nav[2]/ul/li[2]/a/@href').get()
            if next_page is None:
                # Fall back to the first <nav> when there is no second one.
                next_page = response.xpath(
                    '/html/body/main/nav/ul/li[2]/a/@href').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
        # Exception (not a bare except) so GeneratorExit raised at the yield
        # and KeyboardInterrupt are not swallowed.
        except Exception:
            print('Hubo un problema al abrir la página siguiente')

    def parse_attr(self, response):
        """Read one post and store its download link when ``veri`` accepts it."""
        titulo = response.css('h1 > a ::text').get()
        cantante, album = separa_titulo(titulo, '-')
        fecha = response.css('time > a ::text').get()
        fecha = strip_spaces(fecha)
        fecha = separa(fecha, '-', 0)
        infringing = response.css(
            'div.post__content > p > a ::attr(href)').get()
        # A post without a link has nothing to store. This used to surface as
        # an AttributeError swallowed by a bare except.
        if infringing is None:
            return
        if infringing.find('images') > 0:
            # The first link points at an image; use the second link instead.
            infringing = response.xpath(
                '//*[@id="post"]/div[2]/p/a[2]/@href').get()
            if infringing is None:
                return
        # The former "or infringing.find('megaupload') > 0" clause could
        # never change the outcome and has been removed.
        try:
            if veri(infringing) == True:
                imprime_datos(titulo, fecha, cantante, album,
                              response.meta['referer'], infringing)
                if self.c.existe_inf(infringing, self.id_domin) == False:
                    self.c.inserta_item(titulo, cantante, album,
                                        response.meta['referer'],
                                        infringing, fecha, self.id_domin)
        except Exception as exc:
            # Still best-effort (the post is skipped), but no longer silent.
            self.logger.warning('Could not store %s: %s', infringing, exc)

Пример #11

0

Показать файл

Файл: ngleakers.py Проект: Jessik167/Scrapy

class ngleakers(scrapy.Spider):
    """Spider for http://ngleakers.com/.

    ``parse`` requests every article of a listing page and follows
    pagination; ``parse_attr`` reads the download link of an article, prints
    the record and stores it.
    """

    name = 'ngleakers'
    _num_pagina = 1
    start_urls = ['http://ngleakers.com/']

    id_domin = 4
    nombre_trelacion = 'dominios'
    # Controller object, created once at class-definition time and therefore
    # shared by every instance of this spider.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if it is missing.

        Args:
            name: Optional spider name forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments forwarded to ``scrapy.Spider``.
        """
        # Run the base initialiser so spider arguments are not dropped.
        super().__init__(name, **kwargs)
        # Create the relational table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Register the domain together with today's date.
            hoy = date.today().strftime("%d %B, %Y")
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Request every article of the listing page, then the next page."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')

        for art in response.css('div#main-content > article'):
            titulo = art.css('a ::attr(title)').get()
            referer = art.css('a ::attr(href)').get()
            # Both classes belong to the same <span>. The previous selector
            # 'span.mh-meta-date updated' looked for an <updated> element
            # inside the span, so the date was always None.
            fecha = art.css('span.mh-meta-date.updated ::text').get()
            cantante, album = separa_titulo(titulo, '–')

            # An article without a link is skipped; building a Request with
            # None would raise and abort the rest of the page.
            if referer is None:
                continue

            # Follow the referer.
            yield scrapy.Request(referer,
                                 callback=self.parse_attr,
                                 meta={
                                     'fecha': fecha,
                                     'referer': referer,
                                     'titulo': titulo,
                                     'cantante': cantante,
                                     'album': album
                                 })
        self._num_pagina += 1
        try:
            next_page = response.css('div.nav-previous > a ::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
        # Exception (not a bare except) so GeneratorExit raised at the yield
        # and KeyboardInterrupt are not swallowed.
        except Exception:
            print('Hubo un problema al abrir la página siguiente')

    def parse_attr(self, response):
        """Print the record of one article and store it when ``veri`` accepts
        its download link."""
        infringing = response.css('h3 > a ::attr(href)').get()
        # Without a download link there is nothing to print or store; the
        # string concatenation below would raise TypeError on None.
        if infringing is None:
            self.logger.warning('No download link found on %s', response.url)
            return

        print('\n*****************DATOS*****************')
        print('infringing: ' + infringing)
        print('referer: ' + response.meta['referer'])
        print('titulo: ' + response.meta['titulo'])
        print('fecha: ' + str(response.meta['fecha']))
        print('cantante: ' + response.meta['cantante'])
        print('album: ' + response.meta['album'])
        print('***************************************\n')

        # Insert into the database only when veri accepts the URL and it is
        # not stored yet.
        if veri(infringing) == True:
            if self.c.existe_inf(infringing, self.id_domin) == False:
                self.c.inserta_item(response.meta['titulo'],
                                    response.meta['cantante'],
                                    response.meta['album'],
                                    response.meta['referer'], infringing,
                                    response.meta['fecha'], self.id_domin)

Пример #12

0

Показать файл

Файл: de-album.py Проект: Jessik167/Scrapy

class de_album(scrapy.Spider):
    """Spider that walks the posts of de-album.blogspot.com.

    For every post found on an index page it requests the post, extracts a
    download link, opens that link in a Selenium-driven Chrome session and
    hands both URLs to the shared controller ``c`` for storage.
    """
    name = 'de-album'
    _num_pagina = 1
    start_urls = ['https://de-album.blogspot.com/']

    id_domin = 11
    nombre_trelacion = 'dominios'
    # Controller object; built once, when the class body is executed.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if absent."""
        # Let scrapy.Spider apply ``name`` and any spider arguments; the
        # original skipped the base initialiser entirely.
        super(de_album, self).__init__(name=name, **kwargs)
        # Create the table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Take today's date.
            hoy = date.today().strftime("%d %B, %Y")
            # Insert into the relational table.
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Yield one request per post on the page, then follow pagination."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')

        for h3 in response.css('h3.post-title') or response.css(
                'h3.post-title.entry-title'):
            # Collect the data of the post heading.
            referer = h3.css('a ::attr(href)').get()
            titulo = h3.css('a ::text').get()
            if referer is None or titulo is None:
                # A heading without link or text cannot be requested or
                # split; skip it instead of aborting the whole page.
                continue
            # Split artist and album.
            cantante, album = separa_titulo(titulo, '-')
            # Request the post page.
            yield scrapy.Request(referer,
                                 callback=self.parse_attr,
                                 meta={
                                     'referer': referer,
                                     'titulo': titulo,
                                     'cantante': cantante,
                                     'album': album
                                 })

        self._num_pagina += 1
        try:
            next_page = response.css(
                'a.blog-pager-older-link.flat-button.ripple ::attr(href)').get(
                )
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
        except Exception:
            # ``Exception`` rather than a bare except so that GeneratorExit
            # raised at the ``yield`` is not swallowed.
            print('Hubo un problema al abrir la página siguiente')

    def parse_attr(self, response):
        """Extract date and download link of a post, resolve and store them.

        The link found on the page is stored first; it is then opened with
        ``Abre_pag`` and the URL returned by that method is stored as well.
        """
        # Take the date; a page without the element yields an empty date
        # instead of raising AttributeError on ``None.strip()``.
        fecha = (response.css('time.published ::text').get() or '').strip()

        # Look for the download link, trying three page layouts in turn.
        link = response.xpath('//*/div[28]/b/span/a/@href').extract_first()
        if link is None:
            link = response.css('div > b > span > a ::attr(href)').get()
        if link is None:
            link = response.xpath(
                '//*/div[2]/div/div[13]/a/@href').extract_first()
        if link is None:
            # Nothing to print, store or open in the browser.
            print('No se encontró el enlace de descarga en: ' +
                  str(response.url))
            return

        prev_inf = link
        meta = response.meta
        # Print the information.
        imprime_datos(meta['titulo'], fecha, meta['cantante'], meta['album'],
                      meta['referer'], prev_inf)
        # Insert into the database.
        self.inserta_BD(response, fecha, prev_inf)
        # Open the intermediate link in the browser.
        infringing = self.Abre_pag(prev_inf)
        imprime_datos(meta['titulo'], fecha, meta['cantante'], meta['album'],
                      meta['referer'], infringing)
        # Insert into the database.
        self.inserta_BD(response, fecha, infringing)

    def inserta_BD(self, response, fecha, inf):
        """Store ``inf`` for this domain unless it already exists or fails
        the ``veri`` check."""
        if self.c.existe_inf(inf, self.id_domin) == False:
            if veri(inf) == True:
                self.c.inserta_item(response.meta['titulo'],
                                    response.meta['cantante'],
                                    response.meta['album'],
                                    response.meta['referer'], inf, fecha,
                                    self.id_domin)

    def Abre_pag(self, url):
        """Open ``url`` in Chrome, click through it and return the final URL.

        Args:
            url: address of the intermediate download page.

        Returns:
            ``driver.current_url`` after the last click (per the original
            comment this is expected to be the Mega link).

        Any exception raised by a Selenium step propagates to the caller;
        the browser is quit in every case.
        """
        # Open the browser.
        driver = webdriver.Chrome(
            'C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
        try:
            driver.get(url)
            time.sleep(1)
            # Click the button.
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button#btn-main"))).click()
            time.sleep(2)
            driver.switch_to.window(window_name=driver.window_handles[1])

            # Click the button again; deliberately best-effort, the flow
            # continues when the button does not become clickable.
            try:
                WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, "button#btn-main"))).click()
            except Exception:
                pass

            element = driver.find_element_by_xpath("/html")
            # Open the pop-ups.
            for _ in range(10):
                ActionChains(driver).move_to_element(element).click().perform()
                time.sleep(3)
                driver.switch_to.window(window_name=driver.window_handles[1])

            # Click the button.
            driver.switch_to.window(window_name=driver.window_handles[1])
            time.sleep(10)
            element = driver.find_element_by_css_selector("button#btn-main")
            ActionChains(driver).move_to_element(element).click().perform()
            time.sleep(3)
            # Take the resulting link.
            return driver.current_url
        finally:
            # Previously the Chrome process was leaked whenever a step
            # above raised before reaching ``quit``.
            driver.quit()

Пример #13

0

Показать файл

Файл: mynewhits.py Проект: Jessik167/Scrapy

class mynewhits(scrapy.Spider):
    """Spider that walks the posts of mynewhits.blogspot.com.

    Each post is requested, its intermediate link is resolved through
    ``open_adfly`` and both the intermediate and the resolved link are
    stored through the shared controller ``c``.
    """
    name = 'mynewhits'
    _num_pagina = 1
    start_urls = ['http://mynewhits.blogspot.com/']

    id_domin = 10
    nombre_trelacion = 'dominios'
    # Controller object; built once, when the class body is executed.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if absent."""
        # Let scrapy.Spider apply ``name`` and any spider arguments; the
        # original skipped the base initialiser entirely.
        super(mynewhits, self).__init__(name=name, **kwargs)
        # Create the table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Take today's date.
            hoy = date.today().strftime("%d %B, %Y")
            # Insert into the relational table.
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Yield one request per post on the page, then follow pagination."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')
        for div in response.css('div.date-posts > div'):
            for a in div.css('article > div > h2 > a'):
                titulo = a.css('::attr(title)').get()
                href = a.css('::attr(href)').get()
                if href is None:
                    # An anchor without href cannot be requested.
                    continue
                cantante, album = self.separa_titulo(titulo, '-')
                yield scrapy.Request(href,
                                     callback=self.parse_attr,
                                     meta={
                                         'referer': href,
                                         'titulo': titulo,
                                         'cantante': cantante,
                                         'album': album
                                     })
        self._num_pagina += 1

        next_page = self._pagina_siguiente(response.url)
        if next_page is not None:
            try:
                yield response.follow(next_page, callback=self.parse)
            except Exception:
                print('Hubo un problema al abrir la página siguiente')

    def _pagina_siguiente(self, url):
        """Return the URL reached by clicking the last pager element.

        Opens ``url`` in Chrome, clicks ``div.pagenavi :last-child`` and
        returns the browser's current URL, or ``None`` when any step fails.
        The browser is quit in every case; the original only closed it on
        the success path and leaked it otherwise.
        """
        driver = None
        try:
            driver = webdriver.Chrome(
                'C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
            time.sleep(1)
            driver.get(url)
            driver.find_element_by_css_selector(
                'div.pagenavi :last-child').click()
            return driver.current_url
        except Exception:
            print('Hubo un problema al abrir la página siguiente')
            return None
        finally:
            if driver is not None:
                driver.quit()

    def parse_attr(self, response):
        """Extract date and links of a post, print them and store them."""
        # A page without the date element yields an empty date instead of
        # raising AttributeError on ``None.strip()``.
        fecha = (response.css(
            'div.post-body.entry-content > div > div.post-info-icon.tanggal > span ::text'
        ).get() or '').strip()
        prev_inf = response.xpath(
            '//*[@class="post-body entry-content"]/div[2]/div[3]/a/@href'
        ).get()
        if prev_inf is None:
            # Nothing to resolve or store for this post.
            print('No se encontró el enlace de descarga en: ' +
                  str(response.url))
            return

        meta = response.meta
        infringing = open_adfly(prev_inf, 'span[id*="skip_button"]')
        imprime_datos(meta['titulo'], fecha, meta['cantante'], meta['album'],
                      meta['referer'], infringing)
        # Insert the resolved link into the database.
        if infringing is not None:
            if self.c.existe_inf(infringing, self.id_domin) == False:
                self.c.inserta_item(meta['titulo'], meta['cantante'],
                                    meta['album'], meta['referer'],
                                    infringing, fecha, self.id_domin)
        # Insert the intermediate link into the database.
        if self.c.existe_inf(prev_inf, self.id_domin) == False:
            self.c.inserta_item(meta['titulo'], meta['cantante'],
                                meta['album'], meta['referer'], prev_inf,
                                fecha, self.id_domin)

    def separa_titulo(self, titulo, separador):
        """Split ``titulo`` on ``separador`` into (cantante, album).

        Returns the first two pieces of the split. When ``titulo`` is
        ``None``/empty or does not contain the separator, ``('-', '-')`` is
        returned instead of raising.
        """
        partes = (titulo or '').split(separador)
        if len(partes) < 2:
            return '-', '-'
        return partes[0], partes[1]

Пример #14

0

Показать файл

class realmmusik(scrapy.Spider):
    """Spider that walks the posts of realm-musik.blogspot.com.

    The index page supplies, per post, a link that is requested and then
    resolved through ``open_adfly``; the resulting URL is stored through
    the shared controller ``c``.
    """
    name = 'realm-musik'
    _num_pagina = 1
    start_urls = ['https://realm-musik.blogspot.com/']

    id_domin = 3
    nombre_trelacion = 'dominios'
    # Controller object; built once, when the class body is executed.
    c = Controler(ModelSQLite(), View())

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and register its domain row if absent."""
        # Let scrapy.Spider apply ``name`` and any spider arguments; the
        # original skipped the base initialiser entirely.
        super(realmmusik, self).__init__(name=name, **kwargs)
        # Create the table.
        self.c.crea_tabla(self.nombre_trelacion)
        if self.c.existe_id(self.id_domin, self.nombre_trelacion) == False:
            # Take today's date.
            hoy = date.today().strftime("%d %B, %Y")
            # Insert into the relational table.
            self.c.inserta_item_relacional(self.id_domin, self.start_urls[0],
                                           hoy, self.nombre_trelacion)

    def parse(self, response):
        """Yield one request per post on the page, then follow pagination."""
        print('\n########Pagina ' + str(self._num_pagina) + '########')

        ids = response.xpath(
            '//*[@id="Blog1"]/div/div/div/div/div/meta[3]/@content').extract()

        # One link (or None) per post id, in document order.
        link_inf = []
        for post_id in ids:
            enlace = response.xpath('//*[@id="post-body-' + post_id +
                                    '"]/div[2]/a/@href').extract_first()
            link_inf.append(enlace)
            print('\nID: ' + str(enlace))

        # A single iterator is consumed across ALL date groups. The original
        # reset its index for every date group and did not advance it after
        # a failure, so later posts were paired with the wrong links.
        enlaces = iter(link_inf)

        # Loop over the page looking for the data.
        for div in response.css('div.blog-posts.hfeed'):
            for dd in div.css('div.date-outer'):
                fecha = dd.css('span ::text').get()
                for d in dd.css('div.post.hentry'):
                    referer = d.css(
                        'h3.post-title.entry-title > a ::attr(href)').get()
                    titulo = d.css(
                        'h3.post-title.entry-title > a ::text').get()
                    cantante, album = self.separa(titulo)
                    enlace = next(enlaces, None)
                    if enlace is not None:
                        try:
                            peticion = scrapy.Request(
                                url=enlace,
                                callback=self.parse_attr,
                                meta={
                                    'fecha': fecha,
                                    'referer': referer,
                                    'titulo': titulo,
                                    'cantante': cantante,
                                    'album': album
                                })
                        except (TypeError, ValueError):
                            # Best-effort: a malformed link skips this post
                            # only, and is reported instead of hidden.
                            print('No se pudo crear la petición para: ' +
                                  str(enlace))
                        else:
                            yield peticion
                    print('\n')

        # Move on to the next page.
        self._num_pagina += 1
        try:
            next_page = response.css(
                'a.blog-pager-older-link ::attr(href)').get()
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
        except Exception:
            # ``Exception`` rather than a bare except so that GeneratorExit
            # raised at the ``yield`` is not swallowed.
            print('Hubo un problema al abrir la página siguiente')

    def parse_attr(self, response):
        """Resolve the requested link, print the record and store it."""
        url = str(response.url)
        # NOTE(review): 'skip_bu2tton' looks like a typo of 'skip_button';
        # left unchanged because open_adfly is not visible here - confirm.
        link_mega = open_adfly(url, 'skip_bu2tton')
        if link_mega is None or 'mega' not in link_mega:
            # Nothing resolvable; separaLink would raise IndexError on a
            # link that does not contain 'mega'.
            return

        infringing = str(unquote('mega' + self.separaLink(link_mega)))
        meta = response.meta
        print('\n*****************DATOS*****************')
        print('infringing: ' + infringing)
        # str() because the date selector on the index page may yield None.
        print('fecha: ' + str(meta['fecha']))
        print('referer: ' + meta['referer'])
        print('titulo: ' + meta['titulo'])
        print('cantante: ' + meta['cantante'])
        print('album: ' + meta['album'])
        print('***************************************\n')

        # Insert into the database.
        if veri(infringing) == True:
            # Was the bare name ``c``, which is not defined at module scope
            # in the visible code and raised NameError here.
            if self.c.existe_inf(infringing, self.id_domin) == False:
                self.c.inserta_item(meta['titulo'], meta['cantante'],
                                    meta['album'], meta['referer'],
                                    infringing, meta['fecha'], self.id_domin)

    def separaLink(self, link):
        """Return everything after the first occurrence of 'mega' in ``link``.

        Splits once only, so a later 'mega' inside the remainder no longer
        truncates the result. Raises IndexError when 'mega' is absent.
        """
        return link.split('mega', 1)[1]

    def separa(self, titulo):
        """Split a post title into (cantante, album).

        The text before the first 'Link' is split on '-'; the first two
        pieces are returned. ``('-', '-')`` is returned when the title is
        ``None``/empty or contains no '-'.
        """
        if not titulo:
            return '-', '-'
        partes = titulo.split('Link')[0].split('-')
        if len(partes) < 2:
            return '-', '-'
        return partes[0], partes[1]

Python ModelSQLite примеры использования