Example #1
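Crawls the El País 2017 front-page archive (hemeroteca): for each month/day pair it builds the portada URL, creates a per-date directory under /tmp/, and downloads every same-host link found up to depth 2. The snippet starts mid-function, so the reconstruction below marks the inferred parts in comments.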
    import os
    import time

    # The snippet begins mid-loop; the imports, function signature, and month
    # loop below are reconstructed from the symmetric day-handling code and are
    # an assumption, not the original author's exact code.
    def crawl_elpais(month, day):
        m = []
        for i in range(1, month):
            # Zero-pad single-digit months so URLs read .../2017/03/...
            if i < 10:
                m.append(["0" + str(i)])
                mstr = "0" + str(i)
            else:
                m.append([i])
                mstr = str(i)
            d = []
            curpath = "/tmp/"
            os.chdir(curpath)
            for j in range(1, day):
                # Same zero-padding for single-digit days.
                if j < 10:
                    d.append(["0" + str(j)])
                    dstr = "0" + str(j)
                else:
                    d.append([j])
                    dstr = str(j)
                # Front page of El País for 2017-<month>-<day>.
                url = "https://elpais.com/hemeroteca/elpais/2017/" + mstr + "/" + dstr + "/m/portada.html"
                print(url)
                # One working directory per date, e.g. "03-07".
                path = mstr + "-" + dstr
                if os.path.isdir(path):
                    print("Path already exists")
                else:
                    os.makedirs(path)
                os.chdir(path)

                # Collect same-host links up to depth 2, then download each one
                # under a timestamped name derived from its host.
                c = Crawler(url)
                c.urlsLevelHost(2)
                for u in c.urls:
                    caux = Crawler(u)
                    faux = Formatter(u)
                    name = faux.hostFromUrl() + str(time.time())
                    caux.downloadOneUrlThread(name)
                os.chdir(curpath)
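The manual zero-padding in both branches above can be collapsed with str.zfill; a minimal standard-library sketch, where pad2 is a hypothetical helper not present in the original code:

    def pad2(n):
        # Hypothetical helper: zero-pad to two digits, e.g. 3 -> "03".
        return str(n).zfill(2)

    # Equivalent URL for 7 March 2017:
    url = f"https://elpais.com/hemeroteca/elpais/2017/{pad2(3)}/{pad2(7)}/m/portada.html"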

Example #2
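The same crawl pattern applied to the El Mundo 2017 archive (hemeroteca). It differs from Example #1 in two ways: progress is mirrored through logging, and each page is saved with an .xml suffix via downloadOneUrlNewspaperThread. The snippet again starts mid-function, at "d = []", so the same parts are reconstructed.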
    import logging
    import os
    import time

    # As in Example #1, the imports, function signature, curpath, and month
    # loop are reconstructed (the original snippet starts at "d = []"); they
    # are an assumption inferred from the day-handling code.
    def crawl_elmundo(month, day):
        m = []
        curpath = "/tmp/"
        os.chdir(curpath)
        for i in range(1, month):
            if i < 10:
                m.append(["0" + str(i)])
                mstr = "0" + str(i)
            else:
                m.append([i])
                mstr = str(i)
            d = []
            # print(os.getcwd())  # debug leftover from the original
            for j in range(1, day):
                if j < 10:
                    d.append(["0" + str(j)])
                    dstr = "0" + str(j)
                else:
                    d.append([j])
                    dstr = str(j)
                # Front page of El Mundo for 2017-<month>-<day>.
                url = "https://elmundo.es/elmundo/hemeroteca/2017/" + mstr + "/" + dstr + "/m/index.html"
                print(url)
                logging.info(url)
                path = mstr + "-" + dstr
                if os.path.isdir(path):
                    print("Path already exists")
                    logging.info("Path already exists")
                else:
                    os.makedirs(path)
                os.chdir(path)

                # Collect same-host links up to depth 2 and download each page
                # with the newspaper pipeline under a timestamped .xml name.
                c = Crawler(url)
                c.urlsLevelHost(2)
                for u in c.urls:
                    caux = Crawler(u)
                    faux = Formatter(u)
                    name = faux.hostFromUrl() + str(time.time()) + ".xml"
                    # Log the exact file name (the original re-called
                    # time.time() here, so the logged name could differ from
                    # the one actually written).
                    logging.info(name)
                    caux.downloadOneUrlNewspaperThread(name)
                os.chdir(curpath)
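A possible invocation, assuming the reconstructed crawl_elmundo signature above; the log file name is illustrative. The exclusive range bounds mean month=13, day=32 cover every month and day, though the single fixed day bound makes short months request nonexistent dates such as February 30 (a flaw the original loops share):

    import logging

    # Route the logging.info() calls in crawl_elmundo to a file.
    logging.basicConfig(filename="elmundo_crawl.log", level=logging.INFO)
    crawl_elmundo(month=13, day=32)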