Example #1
class DownloadItEbook(threading.Thread):
    '''
    This class will download books from itebook.info
    '''
    def __init__(self,
                 group=None,
                 target=None,
                 name=None,
                 args=(),
                 kwargs=None,
                 verbose=None):
        '''
        Constructor, setting location of downloaded book.
        '''
        super(DownloadItEbook, self).__init__(group=group,
                                              target=target,
                                              name=name,
                                              verbose=verbose)

        self.args = args
        self.kwargs = kwargs
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        pass

    def run(self):
        print('running with %s and %s' % (self.args, self.kwargs))
        return

    def getUrl(self, baseUrl, number):
        '''This method constructs the book URL from the base URL and the book number.'''
        return baseUrl + '/book/' + str(number)

    def findBookDetail(self, baseUrl, number):
        ''' This method fetches the book page and builds a Book object
        from its metadata.'''
        url = self.getUrl(baseUrl, number)
        content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(content, "lxml")
        book = Book()
        book.authors.append(Author(soup.find_all(itemprop="author")[0].text))
        book.isbn_13 = soup.find_all(itemprop="isbn")[0].text
        book.bookName = soup.find_all(itemprop="name")[0].text
        book.publisher = soup.find_all(itemprop="publisher")[0].text

        try:
            date = datetime.strptime(
                str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
        except:
            date = datetime.now()
        book.publishedOn = date

        book.numberOfPages = soup.find_all(itemprop="numberOfPages")[0].text
        book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
        book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
        book.bookDescription = soup.find_all(itemprop="description")[0].text
        book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
        try:
            book.subTitle = soup.h3.text
        except:
            traceback.print_exc()
        book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all(
            'td')[1].find_all('b')[0].text
        #         book.fileSize=

        #         .top > div:nth-child(2) > h3:nth-child(2)

        for link in soup.find_all('a'):
            if link.get('href').startswith('http://filepi.com'):
                book.name = link.text
                break
        return book

    def getMaxBookID(self):
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function creates the directory into which the book will be
        downloaded, named after the database max book id + 1.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        return directory_name

    def firefoxDownloadJob(self, book, baseUrl, number):
        '''This method downloads the book behind the given URL using a Selenium-driven Firefox.'''
        directory_name = self.downloadDir()
        # Creating Actual URL
        url = self.getUrl(baseUrl, number)
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)

        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # If the directory does not contain exactly three files,
        # remove everything in it and download afresh.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            imageUrl = url + book.bookImgName
            subUrl = book.bookImgName
            imageFileName = subUrl.split('/')[-1:][0]
            logging.info(imageUrl)

            # Downloading book cover
            bookImagePath = os.path.join(directory_name,
                                         subUrl.split('/')[-1:][0])
            urllib.urlretrieve(baseUrl + book.bookImgName, bookImagePath)
            book.bookImgName = imageFileName
            f = open(os.path.join(directory_name, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()

            fp = webdriver.FirefoxProfile()

            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              False)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              False)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", False)
            fp.set_preference("browser.download.manager.useWindow", False)
            fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                              "application/octet-stream")
            fp.update_preferences()
            driver = webdriver.Firefox(firefox_profile=fp)
            # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
            driver.get(url)
            efd_link = driver.find_element_by_link_text(book.name)
            book.fileSize = driver.find_element_by_xpath(
                "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[8]/td[2]/b"
            ).text
            book.bookFormat = driver.find_element_by_xpath(
                "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[9]/td[2]/b"
            ).text
            efd_link.click()
            flag = True
            while (flag):
                # # checking part file
                time.sleep(10)
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        logging.info(sName.split('.')[-1:][0])
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
                print lst
                if 'part' not in lst:
                    logging.info("flag :" + str(flag))
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    # print files
                    #                     if not self.isBookDownloading(files):
                    #                         driver.close()
                    pass

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        this function will write json file to given dir.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method reports whether the book is still being downloaded.'''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading: %s', isDownloading)
        return isDownloading

    def startDownload(self):
        baseUrl = 'http://it-ebooks.info'
        miss = Missing()
        #         lst = miss.missingNumbers()
        lst = [1464348534, 7102]
        for number in lst:
            print number
            #         for number in range(6998, 0, -1):
            itebook = DownloadItEbook()
            url = itebook.getUrl(baseUrl, number)
            a = urllib2.urlopen(url)
            finalUrl = a.geturl()
            if '404' != finalUrl[-4:-1]:
                book = itebook.findBookDetail(baseUrl, number)
                # Is this book already available (downloaded)?
                # Check whether the book already exists in the database.
                bs = FindingBook().findBookByIsbn(isbn_13=book.isbn_13)
                if bs:
                    print 'this book is already present.', book.isbn_13, book.bookName
                else:
                    try:
                        self.firefoxDownloadJob(book, baseUrl, number)
                        self.updateDatabase()
                    except:
                        print number, baseUrl
                        traceback.print_exc()


#                 try:
#                     thread.start_new_thread( self.updateDatabase, ())
#                 except:
#                     traceback.print_exc()

#                 logging.info("checking  Is this book already availble (downloaded)" + book.bookName)

    def updateDatabase(self):
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def updateBooksMetadata(self):
        miss = Missing()
        listOfDir = miss.availableNumbers()
        listOfDir = listOfDir[1391:]
        baseUrl = 'http://it-ebooks.info'
        for number in listOfDir:
            print '------------------->', number
            #             url = self.getUrl(baseUrl, number)
            #             a = urllib2.urlopen(url)
            #             strig = a.geturl()
            #             if  '404' != strig[-4:-1]:
            #             number=7102
            #         genUrl=self.downloadItEbook.getUrl(baseUrl, number)
            try:
                book = self.findBookDetail(baseUrl, number)
                book.itEbookUrlNumber = number
                subUrl = book.bookImgName
                imageFileName = subUrl.split('/')[-1:][0]
                book.bookImgName = imageFileName
                bookPath = os.path.join(Workspace().libraryPath, number)
                self.writeJsonToDir(bookPath, book)
            except:
                traceback.print_exc()
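
Example #1's DownloadItEbook subclasses threading.Thread, but its run() only echoes the constructor args; the scraping itself is driven synchronously by startDownload(). Below is a minimal usage sketch, assuming the project's Workspace, CreateDatabase, Missing and FindingBook imports resolve and a workspace database already exists; the thread name and args are illustrative only.

if __name__ == '__main__':
    # Illustrative values; run() only prints args/kwargs.
    downloader = DownloadItEbook(name='itebook-downloader',
                                 args=('http://it-ebooks.info',))
    downloader.start()          # executes run()
    downloader.join()
    downloader.startDownload()  # the actual (blocking) scrape/download loop
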
Example #2
class ItEbook(object):
    '''
    This class downloads the first page of itebookinfo.
    '''
    def __init__(self, baseUrl=None):
        '''
        Constructor
        '''
        self.baseUrl = baseUrl
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        self.header_info = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }

        # book image url
        self.imageUrl = None
        self.bookUrl = None
        pass

    def getUrl(self, baseUrl):
        '''this method simply returns the base URL; the baseUrl argument is unused'''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        This method retrieves all the book URLs available on the page, e.g.
        http://itebooks.website/page-2.html
        '''
        url = self.baseUrl + '/' + subUrl
        #         print url
        #         content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            skipList = [
                'HOME', 'Category', 'Animals', 'Architecture', 'Art',
                'Astronomy', 'Biography', 'Biology', 'Business', 'Chemistry',
                'Cinema', 'Cookbooks', 'Cryptography', 'Culture', 'Design',
                'Drawing', 'Economics', 'Encyclopedia and Dictionary',
                'Engineering and Technology', 'Family and Friendship',
                'Fitness', 'Gambling', 'Games', 'Hardware', 'Healthcare',
                'History', 'Hobbies', 'Information Technologies', 'IT ebooks',
                'Languages', 'Martial Arts', 'Mathematics', 'Medicine',
                'Military', 'Music', 'Novels', 'Other', 'Personality',
                'Philosophy', 'Photo', 'Physics', 'Poetry',
                'Politics and Sociology', 'Programming', 'Psychology',
                'Relationships', 'Religion', 'Science', 'Security',
                'Sexuality', 'Software', 'Sport', 'Travel', 'Web Development'
            ]
            #             with open(os.path.dirname(__file__) + os.sep + 'skipList.txt', 'r') as f:
            #                 for line in f:
            #                     skipList.append(line.rstrip('\n'))
            #                 f.close
            listOfBookName = list()
            for link in soup.find_all('a', 'title'):
                if link.text.strip() != '' and link.text not in skipList:

                    listOfBookName.append(link.text)

                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    #                     self.isIsbnAvailableInDatabase()
                    #                     print isBookAvailable, link.text
                    if not isBookAvailable:
                        #                         print link.text, '\t', link.get('href'), isBookAvailable
                        book, bookUrl = self.findBookDetail(link.get('href'))
                        isBookAvailable = self.isIsbnAvailableInDatabase(
                            book.isbn_13)
                        #                     print book
                        if not isBookAvailable:
                            try:
                                print 'uploading database'
                                directory_name = self.downloadEbook(
                                    book, link.get('href'), bookUrl)
                                self.updateDatabase(directory_name)
                            except:
                                print link.get('href')
                                traceback.print_exc()

    def updateDatabase(self, directory_name):
        #         self.createDatabase.creatingDatabase()
        #         self.createDatabase.addingData()
        self.createDatabase.addSingleBookData(directory_name)

    def isIsbnAvailableInDatabase(self, isbn_13=None):
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent

    def isBookNameAvailableInDatabase(self, bookName=None):
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        ''' This method parses the book page and builds a Book object, e.g.
        http://www.ebook777.com/shut-youre-welcome/
        '''
        book = None
        #         url=self.baseUrl+'/'+subUrl
        url = subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            book = Book()
            book.bookDescription = soup.find(id="main-content-inner").p.text
            book.bookName = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='title').text
            book.subTitle = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='subtitle').text
            bookUrl = soup.find(id="main-content-inner").find(
                class_='download-links').find('a')['href']
            table_body = soup.find('table')
            rows = table_body.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) == 3:
                    book.bookImgName = cols[0].img.attrs['alt']
                    self.imageUrl = cols[0].img.attrs['src']
                    if cols[1].text == 'Author':
                        #                         print cols[2].text
                        author = Author()
                        author.authorName = cols[2].text
                        book.authors.append(author)
#                         book.authors.append()

                if len(cols) == 2:
                    if cols[0].text == 'File size':
                        book.fileSize = cols[1].text
                    if cols[0].text == 'Year':
                        try:
                            date = datetime.strptime(cols[1].text, '%Y')
                        except:
                            date = datetime.now()
                        book.publishedOn = date
                    if cols[0].text == 'Pages':
                        book.numberOfPages = cols[1].text
                    if cols[0].text == 'Language':
                        book.inLanguage = cols[1].text
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Category':
                        book.tag = cols[1].text
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Isbn':
                        book.isbn_13 = cols[1].text

#                 print cols

        return book, bookUrl

    def downloadEbook(self, book, refUrl, bookUrl):
        directory_name = self.downloadDir()
        url = refUrl
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        self.downloadBookImage(bookImagePath, self.imageUrl)
        self.writeJsonToDir(directory_name, book)

        r = requests.get(bookUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        bookPath = os.path.join(directory_name, bookUrl.split('/')[-1])
        with open(bookPath, 'wb') as bookFile:
            bookFile.write(r.content)
        try:
            self.extractRar(directory_name)
        except:
            traceback.print_exc()
            pass
        return directory_name

    def firefoxDownloadJob(self, book, refUrl):
        '''This method downloads the book behind the given URL using a Selenium-driven Firefox.'''
        # Creating directory
        directory_name = self.downloadDir()

        # Creating Actual URL
        #         url = self.baseUrl+refUrl
        url = refUrl

        lsFiles = []

        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # If the directory does not contain exactly three files,
        # remove everything in it and download afresh.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            # Downloading book cover
            bookImagePath = os.path.join(directory_name, book.bookImgName)
            self.downloadBookImage(bookImagePath, self.imageUrl)

            # writing json file
            self.writeJsonToDir(directory_name, book)
            binary = FirefoxBinary('/docs/python_projects/firefox/firefox')

            fp = webdriver.FirefoxProfile()

            fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              False)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              False)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", False)
            fp.set_preference("browser.download.manager.useWindow", False)
            #             fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
            fp.set_preference(
                "browser.helperApps.neverAsk.saveToDisk",
                "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed"
            )
            fp.set_preference("browser.helperApps.alwaysAsk.force", False)
            fp.set_preference("browser.popups.showPopupBlocker", False)
            fp.update_preferences()
            driver = webdriver.Firefox(firefox_profile=fp,
                                       firefox_binary=binary)
            # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
            driver.get(url)
            efd_link = driver.find_element_by_css_selector(
                ".download-links > a:nth-child(1)")
            efd_link.click()

            #             efd_link.send_keys(Keys.RETURN)
            flag = True
            while (flag):
                # # checking part file
                time.sleep(10)
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
#                 print lst
                if 'part' not in lst:
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    # print files
                    #                     if not self.isBookDownloading(files):
                    #                         driver.close()
                    pass
        self.extractRar(directory_name)

    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        This method downloads the image at imageUrl and stores it at bookImagePath.
        '''
        r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        with open(bookImagePath, 'wb') as imageFile:
            imageFile.write(r.content)

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        this function will write json file to given dir.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            if row2dict['isbn_13'] is not None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method reports whether the book is still being downloaded.'''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading: %s', isDownloading)
        return isDownloading

    def startDownload(self):
        #         baseUrl = 'http://itebooks.website'
        #         baseUrl = 'http://it-ebooks.directory'
        baseUrl = 'http://www.ebook777.com'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 1100
        while logicTrue:
            subUrl = 'page/' + str(i) + '/'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)


#             if i==4:
#                 break

    def getMaxBookID(self):
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function creates the directory into which the book will be
        downloaded, named after the database max book id + 1.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name

    def extractRar(self, directory_name):
        '''
        extracting rar file
        '''
        os.chdir(directory_name)
        #         directory_name = '/docs/new/library/8006'
        listOfFiles = [
            name for name in os.listdir(directory_name)
            if not os.path.isdir(os.path.join(directory_name, name))
        ]
        for fileName in listOfFiles:
            if fileName.endswith(".rar"):
                #                 print fileName
                rar = rarfile.RarFile(os.path.join(directory_name, fileName))
                #                 print rar.namelist()
                infoList = rar.infolist()
                nameList = rar.namelist()
                for name in nameList:
                    if not ((name.endswith('.html')) or
                            (name.endswith('.htm')) or
                            (name.endswith('.txt'))):
                        rar.extract(name, directory_name)
        pass
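
The book.json serialization in writeJsonToDir (stringify publishedOn, flatten authors to dicts) recurs across these examples. Below is a standalone sketch of the same idea, assuming Book/Author are plain objects whose __dict__ is otherwise JSON-serializable; the function name is illustrative only.

import json
import os
from datetime import datetime

def write_book_json(book_path, book):
    # Copy the object dict so the original book is left untouched.
    row = dict(book.__dict__)
    if isinstance(row.get('publishedOn'), datetime):
        row['publishedOn'] = str(row['publishedOn'])
    # Authors may be plain strings or Author objects; normalise both to dicts.
    row['authors'] = [{'authorName': a} if isinstance(a, str) else a.__dict__
                      for a in row.get('authors', [])]
    with open(os.path.join(book_path, 'book.json'), 'w') as f:
        f.write(json.dumps(row, sort_keys=False, indent=4))
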
Example #3
class ItEbook(object):
    '''
    This class downloads the first page of itebookinfo.
    '''
    def __init__(self, baseUrl=None):
        '''
        Constructor
        '''
        self.baseUrl = baseUrl
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        self.header_info = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }
        pass

    def getUrl(self, baseUrl):
        '''this method simply returns the base URL; the baseUrl argument is unused'''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        This method retrieves all the book URLs available on the page, e.g.
        http://itebooks.website/page-2.html
        '''
        url = self.baseUrl + '/' + subUrl
        print url
        #         content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            skipList = (u'\nCategories', u'\nContact', u'\nUpload',
                        u'\nDonate', u'IT eBooks', u'Prev', u'Next')
            listOfBookName = list()
            for link in soup.find_all('a'):
                if link.text.strip() != '' and link.text not in skipList:
                    listOfBookName.append(link.text)

                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    if not isBookAvailable:
                        print link.text, '\t', link.get(
                            'href'), isBookAvailable
                        book = self.findBookDetail(link.get('href'))
                        #                     print book
                        try:
                            print 'uploading database'
                            self.firefoxDownloadJob(book, link.get('href'))
                            self.updateDatabase()
                        except:
                            print link.get('href')
                            traceback.print_exc()

    def updateDatabase(self):
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        ''' This method fetches the book page and builds a Book object
        from its metadata.'''
        book = None
        url = self.baseUrl + '/' + subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            book = Book()
            book.authors.append(
                Author(soup.find_all(itemprop="author")[0].text))
            book.isbn_10 = soup.find_all(itemprop="isbn")[0].text
            book.isbn_13 = soup.find_all(itemprop="isbn")[1].text
            book.bookName = soup.find_all(itemprop="name")[0].text
            book.publisher = soup.find_all(itemprop="publisher")[0].text

            try:
                date = datetime.strptime(
                    str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
            except:
                date = datetime.now()
            book.publishedOn = date

            book.numberOfPages = soup.find_all(
                itemprop="numberOfPages")[0].text
            book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
            book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
            book.bookDescription = soup.find_all("div",
                                                 {"class": "span12"})[3].text
            print soup.find_all(itemprop="image")
            book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
            try:
                book.subTitle = soup.find_all("div",
                                              {"class": "span12"})[1].text
            except:
                traceback.print_exc()

#             book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all('td')[1].find_all('b')[0].text
            book.fileSize = soup.find_all(
                'table', {"class": "table table-bordered"
                          })[1].find_all('tr')[5].find_all('td')[1].text

    #         book.fileSize=

    #         .top > div:nth-child(2) > h3:nth-child(2)

#             for link in soup.find_all('a'):
#                 if link.get('href').startswith('http://filepi.com'):
#                     book.name = link.text
#                     break
        return book

    def firefoxDownloadJob(self, book, refUrl):
        '''This method downloads the book behind the given URL using a Selenium-driven Firefox.'''
        # Creating directory
        directory_name = self.downloadDir()

        # Creating Actual URL
        url = self.baseUrl + refUrl

        lsFiles = []

        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # If the directory does not contain exactly three files,
        # remove everything in it and download afresh.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            imageUrl = self.baseUrl + book.bookImgName
            subUrl = book.bookImgName
            imageFileName = subUrl.split('/')[-1:][0]

            # Downloading book cover
            bookImagePath = os.path.join(directory_name,
                                         subUrl.split('/')[-1:][0])
            #             urllib.urlretrieve(imageUrl,bookImagePath)
            r = requests.get(imageUrl, headers=self.header_info, timeout=30)
            print '--------------->', r.url
            with open(bookImagePath, 'wb') as imageFile:
                imageFile.write(r.content)

            book.bookImgName = imageFileName
            #writing json file
            self.writeJsonToDir(directory_name, book)

            fp = webdriver.FirefoxProfile()

            fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              True)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              True)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", True)
            fp.set_preference("browser.download.manager.useWindow", True)
            fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                              "application/octet-stream")
            fp.update_preferences()
            driver = webdriver.Firefox(firefox_profile=fp)  # apply the Firefox profile configured above
            # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
            driver.get(url)
            efd_link = driver.find_element_by_id(id_='download')
            #             efd_link.click()
            efd_link.send_keys(Keys.RETURN)
            flag = True
            while (flag):
                # # checking part file
                time.sleep(10)
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
                print lst
                if 'part' not in lst:
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    #print files
                    #                     if not self.isBookDownloading(files):
                    #                         driver.close()
                    pass

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        this function will write json file to given dir.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method reports whether the book is still being downloaded.'''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading: %s', isDownloading)
        return isDownloading

    def startDownload(self):
        baseUrl = 'http://itebooks.website'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 2
        while logicTrue:
            subUrl = 'page-' + str(i) + '.html'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)


#             if i==4:
#                 break

    def getMaxBookID(self):
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function creates the directory into which the book will be
        downloaded, named after the database max book id + 1.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name
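
findBookDetail in Examples #1, #3 and #6 relies on schema.org itemprop attributes in the page markup. A small self-contained sketch of that BeautifulSoup pattern follows; the HTML fragment is made up for illustration.

from bs4 import BeautifulSoup

html = '''
<div>
  <h4 itemprop="name">Sample Book</h4>
  <span itemprop="author">Jane Doe</span>
  <span itemprop="isbn">9780000000002</span>
  <span itemprop="publisher">Sample Press</span>
</div>
'''
soup = BeautifulSoup(html, "lxml")
# find_all() filters on arbitrary attributes, so itemprop works directly.
print(soup.find_all(itemprop="name")[0].text)    # Sample Book
print(soup.find_all(itemprop="author")[0].text)  # Jane Doe
print(soup.find_all(itemprop="isbn")[0].text)    # 9780000000002
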
Example #4
class AddBook():
    '''
    This class has been written to add a book to the Opal workspace library.
    '''
    def __init__(self, libraryPath=None):
        self.book = Book()
        self.book.uuid = str(uuid.uuid4())
        self.book.tag = None
        self.book.authors = list()
        self.libraryPath = libraryPath
        self.createDatabase = CreateDatabase(libraryPath=libraryPath)

    def getMaxBookID(self):

        maxBookId = self.createDatabase.getMaxBookID()
        if maxBookId == None:
            maxBookId = 0
        return maxBookId

    def addingBookToWorkspace(self, sourcePath=None, maxBookId=None):
        '''
        This function adds a new book to the workspace. It gets the max id from the
        Book table and creates a folder named max id + 1.
        @param sourcePath: the path of the selected book file.
        -1. Check if a database is present in the workspace (it may be a new workspace).
        0. Check if the book is already present in the workspace.
        1. Create a folder named max_book_id + 1.
        2. Copy the book file into the directory.
        3. Create the metadata file (book.json).
        4. Make an entry in the database.
        '''

        if sourcePath:
            #             if maxBookId:
            #                 maxBookId = self.createDatabase.getMaxBookID()
            #
            #                 if maxBookId == None:
            #                     maxBookId = 0
            #             workspacePath = Workspace().libraryPath
            self.book.bookPath = os.path.join(self.libraryPath,
                                              str(maxBookId + 1))

            head, tail = os.path.split(sourcePath)
            self.book.bookFileName = tail

            self.book.inLanguage = 'English'
            self.book.hasCover = 'Y'

            splited_name = tail.split(".")
            self.book.bookFormat = splited_name[-1:][0]
            splited_name.remove(self.book.bookFormat)
            book_file_name = '.'.join(splited_name)
            self.book.bookName = book_file_name
            self.book.wishListed = 'No'

            if not self.findingSameBook():

                self.book.bookPath = os.path.join(self.libraryPath,
                                                  str(maxBookId + 1))
                if not os.path.exists(self.book.bookPath):
                    os.makedirs(self.book.bookPath)

                dest = os.path.join(self.book.bookPath, tail)
                if sourcePath != dest:
                    shutil.copy(sourcePath, dest)

                if 'pdf' == self.book.bookFormat:
                    self.getPdfMetadata(sourcePath)
                if 'epub' == self.book.bookFormat:
                    self.getEpubMetadata(sourcePath)
                    pass

                os.chdir(self.book.bookPath)
                self.book.bookImgName = book_file_name + '.jpg'
                BookImage().getBookImage(self.book.bookPath, book_file_name,
                                         self.book.bookFormat)

                book_copy1 = copy.deepcopy(self.book)
                self.writeBookJson(self.book.bookPath, book_copy1)
                self.addingBookInfoInDatabase(self.book)

    def getImageFileName(self):
        # Default to the stored image name; fall back to a "-<number>.jpg" file
        # found in the current directory when the expected cover is missing.
        bookImgName = self.book.bookImgName
        imgFilePath = os.path.join(self.book.bookPath, self.book.bookImgName)
        if not os.path.exists(imgFilePath):
            directory = '.'
            pattern = re.compile(r"\-(\d*)\.jpg$")
            for file in os.listdir(directory):
                print(file)
                m = pattern.search(file)
                if m:
                    bookImgName = self.book.bookImgName.replace(
                        '.jpg', m.group())
                    imgFilePath = os.path.join(self.book.bookPath,
                                               bookImgName)
        return bookImgName

    def findingSameBook(self):
        '''
        This method checks whether the same book is already present in the workspace.
        1. Check for the same book file name.
        2. Check for the ISBN.
        '''
        logger.debug('findingSameBook')
        isSameBookPresent = False
        books = self.createDatabase.findBookByFileName(self.book.bookFileName)
        logger.debug('len(books): %s', len(books))
        if len(books) > 0:
            isSameBookPresent = True
        return isSameBookPresent

    def addingBookInfoInDatabase(self, book):
        '''
        This method adds the new book's info to the database.
        '''
        logger.debug('addingBookInfoInDatabase')
        self.createDatabase.saveBook(book)

    def writeBookJson(self, newDirPath=None, book=None):
        '''
        This function writes book.json (the metadata) for the newly added book in the workspace.
        '''
        logger.debug('writeBookJson newDirPath: %s', newDirPath)
        f = open(os.path.join(newDirPath, 'book.json'), 'w')
        row2dict = dict(book.__dict__)
        authors = []
        try:
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                if '_sa_instance_state' in author:
                    del author['_sa_instance_state']
                if 'book_assoc' in author:
                    del author['book_assoc']
                authors.append(author)
            if '_sa_instance_state' in row2dict:
                del row2dict['_sa_instance_state']
            if 'authors' in row2dict:
                del row2dict['authors']
            if 'book_assoc' in row2dict:
                del row2dict['book_assoc']

            row2dict['authors'] = authors
            row2dict['publishedOn'] = str(datetime.now())
            row2dict['createdOn'] = str(datetime.now())
        except Exception as e:
            logger.error(e)
#             print newDirPath
#             print row2dict
        f.write(json.dumps(row2dict, sort_keys=True, indent=4))

        f.close()

    def getEpubMetadata(self, path=None):
        logger.debug('getEpubMetadata')
        os.chdir(self.book.bookPath)
        file_name = self.book.bookName + '.epub'
        epubBook = EpubBook()
        epubBook.open(file_name)

        epubBook.parse_contents()

        authorList = list()
        for authorName in epubBook.get_authors():

            author = Author()
            author.authorName = authorName
            author.aboutAuthor = 'aboutAuthor'
            authorList.append(author)
        self.book.authors = authorList

        self.book.tag = epubBook.subjectTag
        epubBook.extract_cover_image(outdir='.')
        self.book.createdOn = datetime.now()

    def getPdfMetadata(self, path=None):
        '''
        This method reads the PDF metadata and fills in the book object.
        '''
        logger.debug('getPdfMetadata path: %s', path)

        if path:
            try:
                input = PdfFileReader(open(path, "rb"))
                logger.debug('getIsEncrypted : %s ', input.getIsEncrypted())
            except Exception as e:
                logger.error(e, exc_info=True)
            pdf_info = None
            try:
                pdf_toread = PdfFileReader(open(path, "rb"))
                if pdf_toread.isEncrypted:
                    try:
                        pdf_toread.decrypt('')
                    except Exception as e:
                        logger.error(e, exc_info=True)
            except Exception as e:
                logger.error(e, exc_info=True)
            try:
                pdf_info = pdf_toread.getDocumentInfo()
                logger.debug('NumPages:%s', pdf_toread.getNumPages())
                self.book.numberOfPages = pdf_toread.getNumPages()
                #             value = pdf_info.subject
                subject = None
                if pdf_info.subject and type(pdf_info.subject) == str:
                    # Ignore errors even if the string is not proper UTF-8 or has
                    # broken marker bytes.
                    # Python built-in function unicode() can do this.
                    subject = pdf_info.subject

#                 else:
#                     # Assume the value object has proper __unicode__() method
#                     value = unicode(pdf_info.subject)
#                     print 'else'
                if not self.book.tag and subject:
                    self.book.tag = subject
                elif self.book.tag and subject:
                    self.book.tag = self.book.tag + ' ' + subject
            except Exception as e:
                logger.error(e, exc_info=True)
            try:
                if pdf_info.title != None and pdf_info.title.strip() != '':
                    self.book.bookName = str(pdf_info.title)
            except Exception as e:
                logger.error(e, exc_info=True)

            try:
                if pdf_info.creator:
                    self.book.publisher = str(pdf_info.creator.encode('utf-8'))
            except Exception as e:
                logger.error(e, exc_info=True)
            self.book.createdOn = datetime.now()
            try:
                #                 print str(pdf_info['/CreationDate'])[2:10]
                date = datetime.strptime(
                    str(pdf_info['/CreationDate'])[2:10], '%Y%m%d')
                self.book.publishedOn = date
            except Exception as e:
                logger.error(e, exc_info=True)
                logger.error('CreationDate not found')

            logger.debug(Util().convert_bytes(os.path.getsize(path)))
            self.book.fileSize = Util().convert_bytes(os.path.getsize(path))

            #             if 'ISBN'.lower() in str(pdf_info['/Subject']).lower():
            #                 self.book.isbn_13 = str(pdf_info['/Subject'])[6:]

            author = Author()
            val = 'Unknown'
            try:
                if pdf_info.author != None and pdf_info.author.strip() != '':
                    val = pdf_info.author


#                     val = val.encode("utf8", "ignore")
            except Exception as e:
                logger.error(e, exc_info=True)
            author.authorName = val

            authorList = list()
            authorList.append(author)
            self.book.authors = authorList
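
getPdfMetadata above uses the legacy PyPDF2 PdfFileReader API. A minimal sketch of the same calls for a single file follows, assuming an older PyPDF2 release that still ships PdfFileReader; the helper name is illustrative only.

from PyPDF2 import PdfFileReader

def read_pdf_metadata(path):
    reader = PdfFileReader(open(path, "rb"))
    if reader.isEncrypted:
        try:
            reader.decrypt('')  # many nominally encrypted PDFs use an empty password
        except Exception:
            pass
    info = reader.getDocumentInfo()
    return {
        'title': info.title if info else None,
        'author': info.author if info else None,
        'subject': info.subject if info else None,
        'pages': reader.getNumPages(),
    }
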
Example #5
class PacktpubCrawl:
    def __init__(self):
        self.baseUrl = "https://www.packtpub.com/"
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase() 

    def findBookUrl(self):
        directory_name = '.'
        binary = FirefoxBinary('/docs/python_projects/firefox/firefox')

        fp = webdriver.FirefoxProfile()

        fp.set_preference("webdriver.log.file", "/tmp/firefox_console");
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
#             fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed");
        fp.set_preference("browser.helperApps.alwaysAsk.force", False);
        fp.set_preference("browser.popups.showPopupBlocker", False);
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(self.baseUrl)
        efd_link = driver.find_element_by_css_selector(".login-popup > div:nth-child(1)")
        efd_link.click()
        try:
            emailEl = driver.find_element_by_css_selector('#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > input:nth-child(1)')
#             emailEl = driver.find_element_by_name("email")
            '''
            Login with user credential
            '''
            emailEl.send_keys('*****@*****.**')
            passwordEl = driver.find_element_by_css_selector("#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > input:nth-child(1)")
            passwordEl.send_keys('default')
            loginEl = driver.find_element_by_css_selector("#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(3) > input:nth-child(1)")
            loginEl.click()
            
            if True:
                '''
                clicking on My Account
                '''
                myAccountEl = driver.find_element_by_css_selector('#account-bar-logged-in > a:nth-child(1) > div:nth-child(1) > strong:nth-child(1)')
                myAccountEl.click()
                
                '''
                clicking My ebooks
                '''
                myEbook = driver.get(self.baseUrl + 'account/my-ebooks')
                productListEls = driver.find_elements_by_css_selector('div.product-line')
                print len(productListEls)
                bookList = list()
                for productEl in productListEls:
                    print productEl
                    
                    try:
                        bookName = productEl.find_element_by_css_selector('.title').text
                        book = self.createBookDetail(bookName)
                        productEl.click()
                        readMeEl = productEl.find_element_by_css_selector('.fake-button-text')
                        print 'new page',
                        isbnEl = productEl.find_elements_by_css_selector('div > div:nth-child(2) > div:nth-child(1)> a:nth-child(1) > div:nth-child(1)')
                        book.isbn_13 = isbnEl[0].get_attribute('isbn')
#                     readMeEl.click()
                        print 'div.product-line:nth-child(1) > div:nth-child(2) > div:nth-child(1) > a:nth-child(1) > div:nth-child(1)',
#                     readMeEl.find_element_by_css_selector('h2.ng-binding')
#                     
#                     readingEl = driver.get('https://www.packtpub.com/mapt/book/All%20Books/' + book.isbn_13)
#                     bookName1=driver.find_elements_by_css_selector('h2.ng-binding')[0].text
                    
                        bookList.append(book)
                    except Exception as e:
                        print e
#                 product_account_list_el=driver.find_elements_by_css_selector('#product-account-list')
            
            driver.get('https://www.packtpub.com/packt/offers/free-learning')
            try:
                '''
                clicking on Claim your free ebook
                '''
                bookNameEl_1 = driver.find_element_by_css_selector('.dotd-title > h2:nth-child(1)')
                isBookAlreadyAvailable = False
                bookName_1 = bookNameEl_1.text
                for book in bookList:
                    if bookName_1 in book.bookName:
                        isBookAlreadyAvailable = True
                        break
                        
                if not isBookAlreadyAvailable:
                    claimFreeEbookEl = driver.find_element_by_css_selector('.book-claim-token-inner > input:nth-child(3)')
                    claimFreeEbookEl.click()
            except Exception as e:
                print e
                
#             myEbook.click()
            
        except Exception as e:
            print e
        finally:
            print 'completed'
        print 'hi'

    def createBookDetail(self, bookName=None):
        book = Book()   
        book.bookName = bookName
        book.bookFormat = 'pdf'
        book.tag = 'Technology'
        book.inLanguage = 'English'
        book.subTitle = None
        book.publisher = "Packt Publishing Limited"
        book.bookImgName = bookName + '.jpg'
        book.hasCover = 'Yes'
        book.hasCode = None
        
        return book
    
    def getMaxBookID(self):
        '''
        This function returns the max book id from the database (0 if the table is empty).
        '''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0        
        return maxBookId

    def downloadDir(self):
        '''
        This function creates the directory into which the book will be
        downloaded, named after the database max book id + 1.
        '''
        directory_name = os.path.join(self.directory_name, str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name    
Example #6
0
class ItEbook(object):
    '''
    This class downloads books listed on the front page of it-ebooks.info.
    '''
    def __init__(self, baseUrl=None):
        '''
        Constructor
        '''
        self.baseUrl = baseUrl
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        pass

    def getUrl(self, baseUrl):
        '''Return the base URL used to construct the book page URLs.'''
        return self.baseUrl

    def findAllBookUrl(self):
        '''
        Retrieve every book URL available on the page and download each book
        that is not already in the database.
        '''
        content = urllib2.urlopen(self.baseUrl).read()
        soup = BeautifulSoup(content, "lxml")

        skipList = ('IT eBooks', 'IT eBooks Group', u'IT-eBooks.Info',
                    u'IT-eBooks API', u'IT-eBooks Search', 'Tweet')
        listOfBookName = list()
        for link in soup.find_all('a'):
            if link.text.strip() != '' and link.text not in skipList:
                listOfBookName.append(link.text)
                isBookAvailable = self.isBookNameAvailableInDatabase(link.text)
                if not isBookAvailable:
                    print link.text, '\t', link.get('href'), isBookAvailable
                    book = self.findBookDetail(link.get('href'))
                    #                     print book
                    try:
                        self.firefoxDownloadJob(book, link.get('href'))
                        self.updateDatabase()
                    except:
                        print link.get('href')
                        traceback.print_exc()
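
    def extractCandidateBookLinks(self, soup, skipList):
        '''
        A hedged sketch (hypothetical helper, not used by findAllBookUrl):
        filter the page's anchors the same way the loop above does and return
        (text, href) pairs for links that look like book entries.
        '''
        links = []
        for link in soup.find_all('a'):
            if link.text.strip() != '' and link.text not in skipList:
                links.append((link.text, link.get('href')))
        return links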

    def updateDatabase(self):
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, number):
        ''' Parse the book detail page at baseUrl + number and return a
        populated Book object, including the cover image URL.'''

        url = self.baseUrl + number
        content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(content, "lxml")
        book = Book()
        book.authors.append(Author(soup.find_all(itemprop="author")[0].text))
        book.isbn_13 = soup.find_all(itemprop="isbn")[0].text
        book.bookName = soup.find_all(itemprop="name")[0].text
        book.publisher = soup.find_all(itemprop="publisher")[0].text

        try:
            date = datetime.strptime(
                str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
        except:
            date = datetime.now()
        book.publishedOn = date

        book.numberOfPages = soup.find_all(itemprop="numberOfPages")[0].text
        book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
        book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
        book.bookDescription = soup.find_all(itemprop="description")[0].text
        book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
        try:
            book.subTitle = soup.h3.text
        except:
            traceback.print_exc()
        book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all(
            'td')[1].find_all('b')[0].text
        #         book.fileSize=

        #         .top > div:nth-child(2) > h3:nth-child(2)

        for link in soup.find_all('a'):
            if link.get('href').startswith('http://filepi.com'):
                book.name = link.text
                break
        return book
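
    def getItemprop(self, soup, name, default=None):
        '''
        A hedged sketch (hypothetical helper, not called by findBookDetail):
        return the text of the first element carrying the given itemprop, or a
        default when the page layout changes and the element is missing.
        '''
        found = soup.find_all(itemprop=name)
        return found[0].text if found else default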

    def firefoxDownloadJob(self, book, refUrl):
        '''Download the book file behind the given relative URL using a Firefox browser session.'''
        # Creating directory
        directory_name = self.downloadDir()

        # Creating Actual URL
        url = self.baseUrl + refUrl

        lsFiles = []

        # Checking whether the download directory already holds the expected
        # three files; if so, just list them.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # Otherwise the directory is in an unknown state:
        # remove everything and download the book again.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            imageUrl = self.baseUrl + book.bookImgName
            subUrl = book.bookImgName
            imageFileName = subUrl.split('/')[-1:][0]

            # Downloading book cover
            bookImagePath = os.path.join(directory_name,
                                         subUrl.split('/')[-1:][0])
            urllib.urlretrieve(imageUrl, bookImagePath)
            book.bookImgName = imageFileName
            # Writing the book metadata to book.json
            self.writeJsonToDir(directory_name, book)
            binary = FirefoxBinary('/docs/python_projects/firefox/firefox')

            fp = webdriver.FirefoxProfile()

            fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              False)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              False)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", False)
            fp.set_preference("browser.download.manager.useWindow", False)
            fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                              "application/octet-stream")
            fp.update_preferences()
            driver = webdriver.Firefox(firefox_profile=fp,
                                       firefox_binary=binary)
            # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
            driver.get(url)
            efd_link = driver.find_element_by_link_text(book.name)
            book.fileSize = driver.find_element_by_xpath(
                "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[8]/td[2]/b"
            ).text
            book.bookFormat = driver.find_element_by_xpath(
                "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[9]/td[2]/b"
            ).text
            efd_link.click()
            flag = True
            while (flag):
                # # checking part file
                time.sleep(10)
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
                print lst
                if 'part' not in lst:
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    #print files
                    #                     if not self.isBookDownloading(files):
                    #                         driver.close()
                    pass
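
    def buildDownloadProfile(self, directory_name):
        '''
        A hedged sketch (hypothetical helper): the Firefox profile preferences
        used in firefoxDownloadJob above, collected in one place so the same
        silent-download setup can be reused elsewhere.
        '''
        fp = webdriver.FirefoxProfile()
        # Save downloads to a custom directory without prompting.
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "application/octet-stream")
        fp.update_preferences()
        return fp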

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the book object to book.json inside the given directory.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()
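
    def readJsonFromDir(self, bookPath=None):
        '''
        A hedged counterpart sketch (hypothetical helper): load the book.json
        written by writeJsonToDir and return the plain dict, or None when the
        file is missing or unreadable.
        '''
        try:
            with open(os.path.join(bookPath, 'book.json')) as f:
                return json.load(f)
        except:
            traceback.print_exc()
            return None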

    def isBookDownloading(self, files):
        ''' Report whether the book download is still in progress, based on the sizes of the Firefox .part files.'''
        #time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])


#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading: %s', isDownloading)
        return isDownloading
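
    def partFilesStillGrowing(self, directory_name, interval=10):
        '''
        A hedged alternative sketch (hypothetical, not wired into the download
        loop): take two size snapshots of the Firefox .part files a few seconds
        apart; if any size changed the download is still making progress.
        '''
        def snapshot():
            sizes = {}
            for sName in os.listdir(directory_name):
                path = os.path.join(directory_name, sName)
                if os.path.isfile(path) and sName.endswith('.part'):
                    sizes[path] = os.stat(path).st_size
            return sizes

        before = snapshot()
        time.sleep(interval)
        after = snapshot()
        return before != after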

    def startDownload(self):
        baseUrl = 'http://it-ebooks.info'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        itebook.findAllBookUrl()

    def getMaxBookID(self):
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        Create the directory the next book will be downloaded into, named
        after the database max id + 1, change into it and return its path.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name
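
# A hedged usage sketch for ItEbook (assuming Workspace and CreateDatabase are
# wired up as in the constructor above):
#
#     itebook = ItEbook('http://it-ebooks.info')
#     itebook.findAllBookUrl()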
Example #7
0
class FullCircleMagazine(object):
    
    def __init__(self, baseUrl=None):
        self.baseUrl = baseUrl
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase() 
        self.header_info = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'}
        
        # book image url
        self.imageUrl = None
        self.bookUrl = None
        pass
    
    def downloadFullCircleMagazine(self, url, book=None, bookUrl=None):
        '''
        Download one Full Circle Magazine PDF from the given URL into a newly
        created book directory, together with its cover image and book.json.
        '''
#         url = 'http://dl.fullcirclemagazine.org/issue1_en.pdf'
#         'http://dl.fullcirclemagazine.org/issue3_en.pdf'
        directory_name = self.createDownloadDir()
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        os.chdir(directory_name)
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            print r.status_code, url
            print '------->', int(r.headers["content-length"]) / 1000000.0
            book.fileSize = str(round(int(r.headers["content-length"]) / 1000000.0, 2)) + ' MB'
            self.writeJsonToDir(directory_name, book)
            self.downloadBookImage(bookImagePath, self.imageUrl)
#             r = requests.get(bookUrl, headers=self.header_info, timeout=30)
            print '--------------->', r.url
            bookPath = os.path.join(directory_name, url.split('/')[-1])
            print bookPath
            with open(bookPath, 'wb') as bookFile:
                bookFile.write(r.content)
            self.updateDatabase(directory_name)
        return r.status_code, directory_name  
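
    def downloadPdfStream(self, url, bookPath):
        '''
        A hedged alternative sketch (hypothetical, not used above): stream the
        PDF to disk in chunks instead of keeping the whole response body in
        memory, which matters for the larger issues.
        '''
        r = requests.get(url, headers=self.header_info, timeout=30, stream=True)
        if r.status_code == 200:
            with open(bookPath, 'wb') as bookFile:
                for chunk in r.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        bookFile.write(chunk)
        return r.status_code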
    
    def createBookDetail(self, bookName=None):
        book = Book()   
        book.bookName = "Full Circle "+ bookName
        book.bookFormat = 'pdf'
        book.tag = 'Technology'
        book.inLanguage = 'English'
        book.subTitle = 'Magazine'
        book.publisher = "Full Circle"
        book.bookImgName = bookName + '.jpg'
        book.hasCover = 'Yes'
        book.hasCode = 'No'
        return book
            
    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the book object to book.json in the given directory,
        normalising the publication date, the author list and empty ISBNs.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                
                authors.append(author)
            row2dict['authors'] = authors
            if row2dict['isbn_13'] is not None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()     
        except:
            traceback.print_exc()   
            
    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        Decode the cover image from a base64 data URI (imageUrl) and write it
        to bookImagePath.
        '''
        print imageUrl
        head, data = imageUrl.split(',', 1)
        bits = head.split(';')
        mime_type = bits[0] if bits[0] else 'text/plain'
        charset, b64 = 'ASCII', False
        for bit in bits:
            if bit.startswith('charset='):
                charset = bit[8:]
            elif bit == 'base64':
                b64 = True
        
        # Decode the payload: the site serves base64 data URIs, so honour the
        # b64 flag instead of assuming base64.
        plaindata = data.decode("base64") if b64 else data
        
        # mime_type and charset are parsed but not needed here; the decoded
        # bytes are written to disk as-is.
        with open(bookImagePath, 'wb') as f:
            f.write(plaindata)

        print 'write image complete'
#         from PIL import Image   
#         from StringIO import StringIO
#         r = requests.get(imageUrl, headers=self.header_info, timeout=30)
#         print '--------------->', r.url
#         with open(bookImagePath, 'wb') as imageFile:
#             imageFile.write(r.content)    


    def updateDatabase(self, directory_name):
#         self.createDatabase.creatingDatabase()  
#         self.createDatabase.addingData() 
        self.createDatabase.addSingleBookData(directory_name)
           
    def isIsbnAvailableInDatabase(self, isbn_13=None):
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent
    
    def isBookNameAvailableInDatabase(self, bookName=None):
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent
    
    def createDownloadDir(self):
        '''
        Create the directory the next book will be downloaded into, named
        after the database max id + 1, change into it and return its path.
        '''
        directory_name = os.path.join(self.directory_name, str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name, 0755)
            os.chdir(directory_name)
        return directory_name
    
    def getMaxBookID(self):
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0        
        return maxBookId
    
    
    def getImageUrl(self, completeUrl, issueCount):
        print completeUrl
        imageUrl = None
        r = requests.get(completeUrl, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
#             print soup
            alt = soup.find(class_='issuetable').find('img')['alt']
            if alt == 'Cover for Issue '+issueCount+' in English':
                imageUrl = soup.find(class_='issuetable').find('img')['src']
                print imageUrl
        return imageUrl
    
    def startDownload(self):
        keepDownloading = True
        i = 1
        while keepDownloading:
            pdfUrl = 'http://dl.fullcirclemagazine.org/issue' + str(i) + '_en.pdf'
            completeUrl = 'http://fullcirclemagazine.org/issue-' + str(i) + '/'
            if not self.isIssuePresent(str(i)):
                self.imageUrl = self.getImageUrl(completeUrl, str(i))
                book = self.createBookDetail('Issue ' + str(i))
                status_code, directory_name = self.downloadFullCircleMagazine(book=book, url=pdfUrl)
                print completeUrl, status_code
                if status_code != 200:
                    keepDownloading = False
            i = i + 1
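
    # A hedged usage sketch (assumed wiring): walk the issues in order and stop
    # at the first URL that does not return HTTP 200.
    #
    #     magazine = FullCircleMagazine()
    #     magazine.startDownload()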
    
    
    def isIssuePresent(self, issue=None):
        isBookPresent = False
        bookName="Full Circle Issue " + issue
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def getIssueDetail(self):
        url = 'https://wiki.ubuntu.com/UbuntuMagazine/FullIssueIndex'
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml") 
            tables = soup.findAll('table')
            for table in tables:
                print table