Пример #1
0
class Discografiasmega(scrapy.Spider):
    """Spider for discografiasmega.com.

    ``parse`` walks the paginated index and schedules every article.
    ``parse_attr`` reads the "MEGA" headings of one article and hands each of
    them to ``Datos_Selenium``, which opens the link page in Chrome, stores
    the intermediate link and, when one is found, the final MEGA link.
    """

    name = 'Discografiasmega'
    _num_pagina = 1
    start_urls = ['https://www.discografiasmega.com/']
    # Domain id for the start URL, looked up once when the class is defined
    # and passed to every Inserta_Datos call.
    id_domin = retorna_dominio(start_urls[0])
    # Location of the chromedriver binary; override it on a subclass or on
    # the instance to run on another machine.
    chromedriver_path = 'C:\\Users\\APDIF\\Desktop\\chromedriver.exe'

    def parse(self, response):
        """Schedule each article of an index page, then follow pagination."""
        for art in response.css('div.archive-main.archive-masonry  article'):
            referer = art.css('h2 > a ::attr(href)').get()
            if not referer:
                # Request(None) raises and would abort the rest of the page.
                continue
            referer = response.urljoin(referer)
            yield Request(referer, meta={'referer': referer},
                          callback=self.parse_attr)

        next_page = response.css('a.next.page-numbers ::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_attr(self, response):
        """Store the links of one article, one per "MEGA" heading.

        The link page is read from the parent of the first heading and is
        reused for every later heading; heading number ``i`` is paired with
        the ``i``-th anchor of that page inside ``Datos_Selenium``.
        """
        Fecha = date.today().strftime("%d %B, %Y")
        referer = response.meta['referer']
        Infringing = None
        headings = response.xpath(".//strong[contains(text(), 'MEGA')]")
        for i, T in enumerate(headings, start=1):
            Titulo = self.Limpia_titulo(T.css('::text').get())
            Cantante, Album = separa_titulo(Titulo, '–')
            if i == 1:
                Infringing = T.xpath('..').css('a ::attr(href)').get()
                if not Infringing:
                    # Without the link page there is nothing to open in the
                    # browser for this or any later heading.
                    self.logger.warning(
                        'No link found next to the first MEGA heading of %s',
                        response.url)
                    return
                Infringing = response.urljoin(Infringing)
                imprime_datos(Titulo, '', Cantante, Album, referer,
                              Infringing)
                Inserta_Datos(Titulo, Cantante, Album, referer, Infringing,
                              Fecha, self.id_domin)
            # Resolve the i-th link of the link page with Selenium.
            self.Datos_Selenium(Titulo, Cantante, Album, referer, Infringing,
                                Fecha, i)

    def Datos_Selenium(self, Titulo, Cantante, Album, Referer, Infringing,
                       Fecha, i):
        """Open ``Infringing`` in Chrome and store the links behind anchor ``i``.

        The text of the ``i``-th anchor inside ``div.link-container`` is
        treated as a URL: it is stored, opened, and then passed (through the
        driver) to ``Get_megaLink``; a truthy result is stored as well.
        The browser is always closed, even when a Selenium call raises.
        """
        driver = webdriver.Chrome(self.chromedriver_path)
        try:
            driver.get(Infringing)
            Inf_short = driver.find_element_by_xpath(
                "//div[@class='link-container']/a[" + str(i) + "]").text
            # Follow the intermediate link.
            driver.get(Inf_short)
            imprime_datos(Titulo, '', Cantante, Album, Referer, Inf_short)
            Inserta_Datos(Titulo, Cantante, Album, Referer, Inf_short, Fecha,
                          self.id_domin)
            Infringing_mega = Get_megaLink(driver)
            # Get_megaLink signals failure with a falsy value; never store it.
            if Infringing_mega:
                imprime_datos(Titulo, '', Cantante, Album, Referer,
                              Infringing_mega)
                Inserta_Datos(Titulo, Cantante, Album, Referer,
                              Infringing_mega, Fecha, self.id_domin)
        finally:
            driver.quit()

    def Limpia_titulo(self, Titulo):
        """Drop 'Descargar', 'MEGA' and anything from the first '[' onwards.

        A falsy title (``None`` or ``''``) is returned unchanged.
        """
        if Titulo:
            Titulo = Titulo.replace('Descargar', '').replace('MEGA', '')
            Titulo = Titulo.split('[')[0]
        return Titulo
Пример #2
0
class elmanualnlhc(scrapy.Spider):
    """Spider for elmanualnlhc.wordpress.com.

    Every post whose first paragraph link is labelled "Download"/"download"
    is stored (title, artist, album, post link, download link, post date)
    unless the download link is an image, fails ``veri`` or already exists.
    """

    name = 'elmanualnlhc'
    _num_pagina = 1
    start_urls = ['https://elmanualnlhc.wordpress.com/']
    # Domain id for the start URL, looked up once when the class is defined.
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Store the download links of one listing page and request the next."""
        for div in response.css('div.narrowcolumn > div'):
            link_text = div.css('div >p > a ::text').extract_first()
            # Only links whose label mentions a download are of interest.
            if link_text is None:
                continue
            if 'Download' not in link_text and 'download' not in link_text:
                continue

            Titulo = div.css('h2 ::attr(title)').get()
            referer = div.css('h2 ::attr(href)').get()
            # The metadata paragraph may be missing; fall back to an empty
            # date instead of failing on None.
            Fecha = div.css('p.postmetadata ::text').get() or ''
            Fecha = Fecha.replace('\n\t\t\t', '')

            infringing = div.css('div >p > a ::attr(href)').extract_first()
            if infringing is None:
                continue
            # Skip image links. The previous test joined the two "not found"
            # checks with `or`, which is true for practically every URL and
            # therefore never filtered anything.
            if '.png' in infringing or '.jpg' in infringing:
                continue
            # veri's return type is not visible here; keep the strict check.
            if not veri(infringing) == True:
                continue

            if Titulo is not None:
                # Replace non-breaking spaces with regular spaces.
                Titulo = Titulo.replace('\xa0', ' ')
            Cantante, Album = separa_titulo(Titulo, '–')
            print((Titulo, Cantante, Album, referer, infringing, Fecha))
            # Insert only links that are not stored yet.
            if c.existe_inf(infringing) == False:
                c.inserta_item(Titulo, Cantante, Album, referer, infringing,
                               Fecha, self.id_domin)
                v.muestra_item_guardado(Titulo)

        # Request the next listing page. The URL is absolute and well formed,
        # so building the request needs no exception handling.
        self._num_pagina += 1
        next_page = 'https://elmanualnlhc.wordpress.com/page/{}/'.format(
            self._num_pagina)
        yield response.follow(next_page, callback=self.parse)
Пример #3
0
class playcorridos(scrapy.Spider):
    """Spider for playcorridos.com.

    For every article the paragraph links are inspected: zippyshare and
    userscloud links are resolved in Chrome, mediafire links are fetched with
    a regular Scrapy request, and every resolved link is stored.
    """

    name = 'playcorridos'
    _num_pagina = 1
    # Date stamp shared by every stored item; computed when the class is defined.
    Fecha = date.today().strftime("%d %B, %Y")
    start_urls = ['http://playcorridos.com/']
    # Domain id for the start URL, looked up once when the class is defined.
    id_domin = retorna_dominio(start_urls[0])
    # Location of the chromedriver binary; override it on a subclass or on
    # the instance to run on another machine.
    chromedriver_path = 'C:\\Users\\APDIF\\Desktop\\chromedriver.exe'

    def parse(self, response):
        """Process the articles of one listing page, then follow pagination."""
        print('##### PÁGINA #{} #####'.format(self._num_pagina))
        for art in response.css('article.item-list'):
            Titulo = self.Limpia_titulo(art.css('h2 > a ::text').get())
            Cantante, Album = separa_titulo(Titulo, '–')
            Titulo, Cantante, Album = self.acentos(Titulo, Cantante, Album)
            referer = art.css('h2 > a ::attr(href)').get()
            for a in art.css('p > a'):
                Inf = a.css('::attr(href)').get()
                if not Inf:
                    continue
                pendiente = None
                # One broken link must not stop the remaining ones. The
                # request is only built here and yielded after the try block:
                # a bare except around a yield also swallows GeneratorExit.
                try:
                    if 'zippyshare' in Inf:
                        self._registra(Titulo, Cantante, Album, referer,
                                       self.zippy(Inf))
                    elif 'mediafire' in Inf:
                        pendiente = Request(Inf,
                                            meta={
                                                'Titulo': Titulo,
                                                'Cantante': Cantante,
                                                'Album': Album,
                                                'Referer': referer
                                            },
                                            callback=self.mediaFire)
                    elif 'userscloud' in Inf:
                        self._registra(Titulo, Cantante, Album, referer,
                                       self.userCloud(Inf))
                except Exception:
                    self.logger.exception('Could not process link %s', Inf)
                    continue
                if pendiente is not None:
                    yield pendiente

        next_page = response.css('span#tie-next-page > a ::attr(href)').get()
        if next_page:
            self._num_pagina += 1
            yield response.follow(next_page, callback=self.parse)

    def _registra(self, Titulo, Cantante, Album, referer, Infringing):
        """Print one resolved link and insert it into the database."""
        imprime_datos(Titulo, '', Cantante, Album, referer, Infringing)
        Inserta_Datos(Titulo, Cantante, Album, referer, Infringing,
                      self.Fecha, self.id_domin)

    def userCloud(self, Inf):
        """Resolve a userscloud page to its final link using Chrome.

        Clicks the optional first button when present, then the download
        button, and parses the link out of the resulting ribbon heading.
        The browser is closed even when a step fails.
        """
        driver = webdriver.Chrome(self.chromedriver_path)
        try:
            driver.get(Inf)
            time.sleep(2)
            try:
                element = driver.find_element_by_css_selector(
                    "button.btn.btn-inverse.btn-icon-stacked")
                driver.execute_script("arguments[0].click();", element)
            except Exception:
                # Best effort: this first button is treated as optional.
                pass
            time.sleep(3)
            element = driver.find_element_by_css_selector(
                "button#btn_download")
            driver.execute_script("arguments[0].click();", element)
            time.sleep(1)
            sel = driver.execute_script(
                "return document.getElementsByClassName('ribbon-heading ribbon-default top-left-right')[0].innerHTML;"
            )
            return self.get_atr(sel).strip()
        finally:
            driver.quit()

    def zippy(self, Inf):
        """Return the href of the ``dlbutton`` element of a zippyshare page.

        The browser is closed even when the element cannot be found.
        """
        driver = webdriver.Chrome(self.chromedriver_path)
        try:
            driver.get(Inf)
            time.sleep(0.5)
            return driver.find_element_by_id("dlbutton").get_attribute('href')
        finally:
            driver.quit()

    def mediaFire(self, response):
        """Store the download link of a mediafire page, if it has one."""
        Infringing = response.css('a.input.popsok ::attr(href)').extract_first()
        if not Infringing:
            self.logger.warning('No download link found on %s', response.url)
            return
        self._registra(response.meta['Titulo'], response.meta['Cantante'],
                       response.meta['Album'], response.meta['Referer'],
                       Infringing.strip())

    def get_atr(self, texto):
        """Pull an attribute value out of an HTML snippet.

        Keeps the text before the first 'onclick', takes the piece between
        the first and second '=' and removes double quotes. Falsy input is
        returned unchanged; a snippet without '=' raises ``IndexError``.
        """
        if texto:
            texto = texto.split('onclick')[0]
            texto = texto.split('=')[1].replace('"', '')
        return texto

    def Limpia_titulo(self, Titulo):
        """Cut the title at the first '('; falsy titles are returned as is."""
        if Titulo:
            Titulo = Titulo.split('(')[0]
        return Titulo

    def get_Album(self, Texto):
        """Return the piece after the first '–', or the stripped text.

        Falsy input is returned unchanged.
        """
        if not Texto:
            return Texto
        Texto = Texto.strip()
        try:
            return Texto.split('–')[1]
        except IndexError:
            # No dash in the text: the whole text is used.
            return Texto

    def acentos(self, Titulo, Cantante, Album):
        """Apply ``strip_accents`` to title, artist and album."""
        return (strip_accents(Titulo), strip_accents(Cantante),
                strip_accents(Album))
Пример #4
0
class discografiaspormega(scrapy.Spider):
    """Spider for discografiaspormega.com.

    ``parse`` schedules every article of the paginated index; ``parse_attr``
    re-opens the article in Chrome, pairs its titles with its links and
    stores the URL each link ends up on.
    """

    name = 'discografiaspormega'
    _num_pagina = 1
    start_urls = ['https://www.discografiaspormega.com/']
    # Domain id for the start URL, looked up once when the class is defined.
    id_domin = retorna_dominio(start_urls[0])
    # Location of the chromedriver binary; override it on a subclass or on
    # the instance to run on another machine.
    chromedriver_path = 'C:\\Users\\APDIF\\Desktop\\chromedriver.exe'

    def parse(self, response):
        """Schedule each article of an index page, then follow pagination."""
        print('##### PÁGINA #{} #####'.format(self._num_pagina))
        for art in response.css('div#content article'):
            referer = art.css('a ::attr(href)').get()
            if not referer:
                # Request(None) raises and would abort the rest of the page.
                continue
            referer = response.urljoin(referer)
            yield Request(referer,
                          meta={'referer': referer},
                          callback=self.parse_attr)

        next_page = response.css('a.next.page-numbers ::attr(href)').get()
        if next_page:
            self._num_pagina += 1
            yield response.follow(next_page, callback=self.parse)

    def parse_attr(self, response):
        """Store the final URL behind each link of one article.

        Titles are the text of every other ``//p/strong`` element (1st, 3rd,
        ...). The first title and the first ``p > a`` link are skipped; the
        remaining titles and links are paired in order, stopping at the
        shorter of the two. The browser is closed even when a step fails.
        """
        # NOTE(review): skipping the first title/link mirrors the previous
        # index arithmetic; confirm against the page layout that the first
        # entries are really not content.
        Fecha = date.today().strftime("%d %B, %Y")
        driver = webdriver.Chrome(self.chromedriver_path)
        try:
            driver.get(response.url)
            negritas = driver.find_elements_by_xpath('//p/strong')
            Titulos = [nodo.text for nodo in negritas[::2]]
            enlaces = driver.find_elements_by_css_selector('p > a')
            # zip() stops at the shorter sequence, so an article without
            # titles no longer raises IndexError.
            for a, bruto in zip(enlaces[1:], Titulos[1:]):
                inf = a.get_attribute('href')
                # Open the link in a new window and read where it ends up.
                driver.execute_script("window.open(arguments[0]);", inf)
                driver.switch_to.window(driver.window_handles[1])
                time.sleep(1)
                Infringing_mega = driver.current_url
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                Titulo = self.Limpia_titulo(bruto)
                Cantante, Album = separa_titulo(Titulo, '–')
                imprime_datos(Titulo, '', Cantante, Album, response.url,
                              Infringing_mega)
                Inserta_Datos(Titulo, Cantante, Album, response.url,
                              Infringing_mega, Fecha, self.id_domin)
        finally:
            driver.quit()

    def Limpia_titulo(self, Titulo):
        """Drop 'Descargar', 'MEGA' and anything from the first '[' onwards.

        A falsy title (``None`` or ``''``) is returned unchanged.
        """
        if Titulo:
            Titulo = Titulo.replace('Descargar', '').replace('MEGA', '')
            Titulo = Titulo.split('[')[0]
        return Titulo
Пример #5
0
            driver.quit()
            return False


def busca_todas_categorias(driver):
    """Run ``extrae_categoria`` on every category linked from the sub-menu.

    Each category is opened in a new window, which receives focus before
    ``extrae_categoria(driver)`` is called. The driver is quit when the
    function ends, including when a step raises.
    """
    try:
        # Read every href before focus leaves the menu window: the anchor
        # elements belong to that window and cannot be queried once the
        # driver has switched to another one.
        enlaces = [
            a.get_attribute('href')
            for a in driver.find_elements_by_css_selector(
                'ul.sub-menu > li > a')
        ]
        for cate_ref in enlaces:
            if not cate_ref:
                continue
            conocidas = set(driver.window_handles)
            driver.execute_script("window.open(arguments[0]);", cate_ref)
            # Pick the handle that did not exist before window.open, so the
            # result does not depend on the order of window_handles.
            nuevas = [h for h in driver.window_handles if h not in conocidas]
            if not nuevas:
                # No window was opened for this category; nothing to extract.
                continue
            driver.switch_to.window(nuevas[-1])
            extrae_categoria(driver)
    finally:
        driver.quit()


def busca_por_categoria(driver, cate_ref):
    """Open ``cate_ref`` in a new window and run ``extrae_categoria`` on it.

    The driver is quit when the function ends, including when opening the
    window or the extraction raises.
    """
    try:
        driver.execute_script("window.open(arguments[0]);", cate_ref)
        # Move focus to the second window handle, i.e. the one just opened
        # when the driver started with a single window.
        driver.switch_to.window(driver.window_handles[1])
        extrae_categoria(driver)
    finally:
        driver.quit()


# Script entry: runs at import time, in this order.
# Date stamp for this run (day, full month name, year).
fecha = date.today().strftime("%d %B, %Y")
# Site to scrape.
url = 'https://www.barboflacmusic.com/'
# Domain id returned by retorna_dominio() for the site URL
# (per the original note: the last id of the domains table in the DB).
id_domin = retorna_dominio(url)
# Open the browser on the site's home page.
driver = webdriver.Chrome('C:\\Users\\APDIF\\Desktop\\chromedriver.exe')
driver.get(url)
#print(get_mega('https://ouo.io/VlrKab'))
#cate_ref = 'https://www.barboflacmusic.com/category/salsa/page/{}/'.format(pag)
# Scrape a single category; busca_por_categoria() quits the driver when done.
busca_por_categoria(driver,
                    'https://www.barboflacmusic.com/category/vallenato/')
Пример #6
0
class mp3teca(scrapy.Spider):
    """Spider for mp3teca.com.

    Every track of the listing is mapped to ``inf_url + <id>``, where the id
    is taken from the track's own URL; that page is fetched and the link of
    its ``a.btn-nwo`` element is stored when it is valid and the track has
    not been stored before.
    """

    name = 'mp3teca'
    # Number of the next listing page to request (the start URL is the first).
    _num_pagina = 2
    start_urls = ['https://mp3teca.com/mp3s/']
    inf_url = 'http://yyy-music.com/d/'
    # Date stamp shared by every stored item; computed when the class is defined.
    hoy = date.today().strftime("%d %B, %Y")

    custom_settings = {'CONCURRENT_REQUESTS': 10, 'DOWNLOAD_DELAY': 0.8}

    # Domain id for the start URL, looked up once when the class is defined.
    id_domin = retorna_dominio(start_urls[0])

    def parse(self, response):
        """Schedule the download page of each listed track and the next page."""
        for li in response.css('div#content > div ul > li'):
            referer = li.css('a ::attr(href)').get()
            Titulo = li.css('a ::text').get()
            if not referer or not Titulo:
                # List items without a link or a label cannot be processed.
                continue
            Titulo = self.give_emoji_free_text(Titulo)
            Cantante, Cancion = separa_titulo(Titulo, '–')
            Cantante = self.give_emoji_free_text(Cantante)
            try:
                track_id = self.get_id(referer)
            except IndexError:
                # The link does not have the expected number of segments.
                continue
            yield Request(self.inf_url + track_id,
                          meta={
                              'referer': referer,
                              'Titulo': Titulo,
                              'Cantante': Cantante,
                              'Cancion': Cancion,
                              'Fecha': self.hoy
                          },
                          callback=self.parse_attr)

        # Request the next listing page. The counter is used before it is
        # advanced: incrementing first skipped page 2, because the counter
        # already starts at 2.
        next_page = 'https://mp3teca.com/mp3s/page/{}/'.format(
            self._num_pagina)
        self._num_pagina += 1
        yield response.follow(next_page, callback=self.parse)

    def parse_attr(self, response):
        """Store the download link of one track page."""
        infringing = response.css('a.btn-nwo ::attr(href)').get()
        if not infringing:
            return
        meta = response.meta
        # veri / existe_ref return types are not visible here; keep the
        # strict comparisons.
        if not veri(infringing) == True:
            return
        if not c.existe_ref(meta['referer']) == False:
            return
        imprime_datos(meta['Titulo'], meta['Fecha'], meta['Cantante'],
                      meta['Cancion'], meta['referer'], infringing)
        guardado = c.inserta_item(meta['Titulo'], meta['Cantante'],
                                  meta['Cancion'], meta['referer'],
                                  infringing, meta['Fecha'], self.id_domin)
        if guardado == True:
            v.muestra_item_guardado(meta['Titulo'])

    def give_emoji_free_text(self, text):
        """Remove every whitespace-separated word that contains an emoji.

        The remaining words are joined with single spaces. A character counts
        as an emoji when it is a member of ``emoji.UNICODE_EMOJI``. Falsy
        input (``None`` or ``''``) is returned unchanged.
        """
        if not text:
            return text
        emoji_chars = {ch for ch in text if ch in emoji.UNICODE_EMOJI}
        return ' '.join(
            word for word in text.split()
            if not any(ch in word for ch in emoji_chars))

    def get_id(self, ref):
        """Return the fifth '/'-separated piece of ``ref``.

        Raises ``IndexError`` when ``ref`` has fewer pieces.
        """
        return ref.split('/')[4]
Пример #7
0
class musiconworldoffmx(scrapy.Spider):
    """Spider for musiconworldoffmx.com.

    ``parse`` follows every link found in the post bodies of the paginated
    index; ``parse_attr`` stores every paragraph link of a post and then
    resolves the last one in Chrome through ``Get_megaLink``.
    """

    name = 'musiconworldoffmx'
    _num_pagina = 1
    start_urls = ['http://musiconworldoffmx.com/']
    # Domain id for the start URL, looked up once when the class is defined.
    id_domin = retorna_dominio(start_urls[0])
    # Location of the chromedriver binary; override it on a subclass or on
    # the instance to run on another machine.
    chromedriver_path = 'C:\\Users\\APDIF\\Desktop\\chromedriver.exe'

    def parse(self, response):
        """Schedule each linked post of an index page, then follow pagination."""
        print('##### PÁGINA #{} #####'.format(self._num_pagina))
        for art in response.css('div.entry-content a'):
            referer = art.css('a ::attr(href)').get()
            if not referer:
                # Request(None) raises and would abort the rest of the page.
                continue
            yield Request(response.urljoin(referer), callback=self.parse_attr)

        next_page = response.css('div.nav-previous > a ::attr(href)').get()
        if next_page:
            self._num_pagina += 1
            yield response.follow(next_page, callback=self.parse)

    def parse_attr(self, response):
        """Store the links of one post and the MEGA link behind the last one.

        Errors are no longer silenced by a blanket ``except``: they propagate
        to Scrapy, which logs them, and the browser is always closed.
        """
        Titulo = response.css('h2.entry-title > a ::text').get()
        Cantante = self.Limpia_titulo(Titulo)
        # The date is built from the first two text nodes of the post meta.
        partes = response.css('div.entry-meta ::text').extract()[:2]
        Fecha = ' '.join(parte.strip() for parte in partes).strip()

        Inf = None
        Album = None
        for alb in response.css('div.entry-content > p a'):
            enlace = alb.css('::attr(href)').get()
            if not enlace:
                continue
            Inf = enlace
            Album = self.get_Album(alb.css('::text').get())
            imprime_datos(Titulo, Fecha, Cantante, Album, response.url, Inf)
            Inserta_Datos(Titulo, Cantante, Album, response.url, Inf, Fecha,
                          self.id_domin)

        if Inf is None:
            # The post has no links, so there is nothing to resolve.
            return

        # NOTE(review): only the last link of the post is resolved in the
        # browser, as before; confirm whether every link should be.
        driver = webdriver.Chrome(self.chromedriver_path)
        try:
            driver.get(Inf)
            Infringing_mega = Get_megaLink(driver)
        finally:
            # The driver was previously never closed.
            driver.quit()
        # Get_megaLink signals failure with a falsy value; never store it.
        if not Infringing_mega:
            return
        imprime_datos(Titulo, Fecha, Cantante, Album, response.url,
                      Infringing_mega)
        Inserta_Datos(Titulo, Cantante, Album, response.url, Infringing_mega,
                      Fecha, self.id_domin)

    def Limpia_titulo(self, Titulo):
        """Derive the artist name from a post title.

        Removes 'Discografia', 'Discograifa' and 'MEGA', cuts the text at the
        first '(' and strips surrounding whitespace. Falsy titles are
        returned unchanged.
        """
        if Titulo:
            for ruido in ('Discografia', 'Discograifa', 'MEGA'):
                Titulo = Titulo.replace(ruido, '')
            Titulo = Titulo.split('(')[0].strip()
        return Titulo

    def get_Album(self, Texto):
        """Return the piece after the first '–', or the stripped text.

        Falsy input is returned unchanged.
        """
        if not Texto:
            return Texto
        Texto = Texto.strip()
        try:
            return Texto.split('–')[1]
        except IndexError:
            # No dash in the text: the whole text is used.
            return Texto