# Example 1
# 0
class ItEbook(object):
    '''
    Scraper for an ebook777-style listing site.

    Walks paginated listing pages, extracts per-book metadata from each
    detail page, downloads the cover image and the ebook archive, and
    registers new books in the local database.

    NOTE(review): Python 2 code (print statements, StringIO import);
    port before running on Python 3.
    '''
    def __init__(self, baseUrl=None):
        '''
        Constructor.

        baseUrl -- root URL of the site to scrape,
                   e.g. 'http://www.ebook777.com'
        '''
        self.baseUrl = baseUrl
        # Library root; a numbered sub-directory is created per book
        # (see downloadDir()).
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Browser-like User-Agent: some sites reject the default
        # python-requests agent.
        self.header_info = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }

        # Cover-image / download URLs of the book currently being
        # processed; imageUrl is set as a side effect of findBookDetail().
        self.imageUrl = None
        self.bookUrl = None
        pass

    def getUrl(self, baseUrl):
        '''Return the configured base URL.

        NOTE(review): the baseUrl argument is ignored.
        '''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        Scrape one listing page and process every book link on it.

        For each title anchor not in the navigation skip list: if the
        book is unknown to the database (checked by name, then by
        ISBN-13), download it and register it.

        subUrl -- page path relative to baseUrl,
                  e.g. 'page/2/' (http://itebooks.website/page-2.html)
        '''
        url = self.baseUrl + '/' + subUrl
        #         print url
        #         content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            # Anchor texts that are navigation/category links, not books.
            skipList = [
                'HOME', 'Category', 'Animals', 'Architecture', 'Art',
                'Astronomy', 'Biography', 'Biology', 'Business', 'Chemistry',
                'Cinema', 'Cookbooks', 'Cryptography', 'Culture', 'Design',
                'Drawing', 'Economics', 'Encyclopedia and Dictionary',
                'Engineering and Technology', 'Family and Friendship',
                'Fitness', 'Gambling', 'Games', 'Hardware', 'Healthcare',
                'History', 'Hobbies', 'Information Technologies', 'IT ebooks',
                'Languages', 'Martial Arts', 'Mathematics', 'Medicine',
                'Military', 'Music', 'Novels', 'Other', 'Personality',
                'Philosophy', 'Photo', 'Physics', 'Poetry',
                'Politics and Sociology', 'Programming', 'Psychology',
                'Relationships', 'Religion', 'Science', 'Security',
                'Sexuality', 'Software', 'Sport', 'Travel', 'Web Development'
            ]
            #             with open(os.path.dirname(__file__) + os.sep + 'skipList.txt', 'r') as f:
            #                 for line in f:
            #                     skipList.append(line.rstrip('\n'))
            #                 f.close
            listOfBookName = list()
            for link in soup.find_all('a', 'title'):
                if link.text.strip() != '' and link.text not in skipList:

                    listOfBookName.append(link.text)

                    # Cheap check first: skip books already stored by name.
                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    #                     self.isIsbnAvailableInDatabase()
                    #                     print isBookAvailable, link.text
                    if not isBookAvailable:
                        #                         print link.text, '\t', link.get('href'), isBookAvailable
                        book, bookUrl = self.findBookDetail(link.get('href'))
                        # Second check: the same book may exist under a
                        # different name; ISBN-13 is the stable key.
                        isBookAvailable = self.isIsbnAvailableInDatabase(
                            book.isbn_13)
                        #                     print book
                        if not isBookAvailable:
                            try:
                                print 'uploading database'
                                directory_name = self.downloadEbook(
                                    book, link.get('href'), bookUrl)
                                self.updateDatabase(directory_name)
                            except:
                                # Best effort: log the failing link and
                                # continue with the next book.
                                print link.get('href')
                                traceback.print_exc()

    def updateDatabase(self, directory_name):
        '''Register the book stored in directory_name in the database.'''
        #         self.createDatabase.creatingDatabase()
        #         self.createDatabase.addingData()
        self.createDatabase.addSingleBookData(directory_name)

    def isIsbnAvailableInDatabase(self, isbn_13=None):
        '''Return True when a book with this ISBN-13 already exists.'''
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''Return True when a book with this exact name already exists.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        ''' Fetch a book detail page and build a Book object from it.

        Side effect: stores the cover-image URL in self.imageUrl.
        Returns (book, bookUrl) where bookUrl is the first download link.

        NOTE(review): if the request does not return HTTP 200, bookUrl
        is never assigned and the return statement raises NameError
        (book would be None in that case as well).
        Example page: http://www.ebook777.com/shut-youre-welcome/
         '''
        book = None
        #         url=self.baseUrl+'/'+subUrl
        url = subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            book = Book()
            book.bookDescription = soup.find(id="main-content-inner").p.text
            book.bookName = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='title').text
            book.subTitle = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='subtitle').text
            bookUrl = soup.find(id="main-content-inner").find(
                class_='download-links').find('a')['href']
            # The metadata table mixes 3-column rows (cover image +
            # author) and 2-column key/value rows.
            table_body = soup.find('table')
            rows = table_body.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) == 3:
                    book.bookImgName = cols[0].img.attrs['alt']
                    self.imageUrl = cols[0].img.attrs['src']
                    if cols[1].text == 'Author':
                        #                         print cols[2].text
                        author = Author()
                        author.authorName = cols[2].text
                        book.authors.append(author)
#                         book.authors.append()

                if len(cols) == 2:
                    if cols[0].text == 'File size':
                        book.fileSize = cols[1].text
                    if cols[0].text == 'Year':
                        # Fall back to "now" when the year is missing
                        # or malformed.
                        try:
                            date = datetime.strptime(cols[1].text, '%Y')
                        except:
                            date = datetime.now()
                        book.publishedOn = date
                    if cols[0].text == 'Pages':
                        book.numberOfPages = cols[1].text
                    if cols[0].text == 'Language':
                        book.inLanguage = cols[1].text
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Category':
                        book.tag = cols[1].text
                    # NOTE(review): duplicate of the 'File format'
                    # check a few lines above; harmless but redundant.
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Isbn':
                        book.isbn_13 = cols[1].text

#                 print cols

        return book, bookUrl

    def downloadEbook(self, book, refUrl, bookUrl):
        '''Download cover, metadata json and book archive into a fresh
        library directory and return that directory's path.

        book    -- populated Book object from findBookDetail()
        refUrl  -- detail-page URL (assigned to a local but otherwise
                   unused here)
        bookUrl -- direct download link for the book archive
        '''
        directory_name = self.downloadDir()
        url = refUrl
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        self.downloadBookImage(bookImagePath, self.imageUrl)
        self.writeJsonToDir(directory_name, book)

        r = requests.get(bookUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        # Local file name is the last path segment of the download URL.
        bookPath = os.path.join(directory_name, bookUrl.split('/')[-1])
        with open(bookPath, 'wb') as bookFile:
            bookFile.write(r.content)
        try:
            self.extractRar(directory_name)
        except:
            # Extraction failures are non-fatal; keep the raw archive.
            traceback.print_exc()
            pass
        return directory_name

    def firefoxDownloadJob(self, book, refUrl):
        '''Download a book by driving a Firefox instance.

        Creates/cleans the target directory, saves cover + json, then
        clicks the download link and polls until no '.part' file
        remains in the directory.

        NOTE(review): overlaps heavily with downloadEbook(); presumably
        kept as an alternative browser-driven download path.
        '''
        # Creating directory
        directory_name = self.downloadDir()

        # Creating Actual URL
        #         url = self.baseUrl+refUrl
        url = refUrl

        lsFiles = []

        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            # Downloading book cover
            bookImagePath = os.path.join(directory_name, book.bookImgName)
            self.downloadBookImage(bookImagePath, self.imageUrl)

            # writing json file
            self.writeJsonToDir(directory_name, book)
            # NOTE(review): hard-coded local Firefox binary path.
            binary = FirefoxBinary('/docs/python_projects/firefox/firefox')

            fp = webdriver.FirefoxProfile()

            fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
            # folderList=2: download into the custom dir set below.
            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              False)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              False)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", False)
            fp.set_preference("browser.download.manager.useWindow", False)
            #             fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
            # Save these MIME types without showing the download dialog.
            fp.set_preference(
                "browser.helperApps.neverAsk.saveToDisk",
                "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed"
            )
            fp.set_preference("browser.helperApps.alwaysAsk.force", False)
            fp.set_preference("browser.popups.showPopupBlocker", False)
            fp.update_preferences()
            driver = webdriver.Firefox(firefox_profile=fp,
                                       firefox_binary=binary)
            # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
            driver.get(url)
            efd_link = driver.find_element_by_css_selector(
                ".download-links > a:nth-child(1)")
            efd_link.click()

            #             efd_link.send_keys(Keys.RETURN)
            # Poll every 10s until Firefox's temporary '.part' file
            # disappears, i.e. the download has finished.
            flag = True
            while (flag):
                # # checking part file
                time.sleep(10)
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
#                 print lst
                if 'part' not in lst:
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    # print files
                    #                     if not self.isBookDownloading(files):
                    #                         driver.close()
                    pass
        self.extractRar(directory_name)

    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        Download the cover image from imageUrl and save it at
        bookImagePath.
        '''
        # NOTE(review): PIL/StringIO are imported but unused here.
        from PIL import Image
        from StringIO import StringIO
        r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        with open(bookImagePath, 'wb') as imageFile:
            imageFile.write(r.content)

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the Book object as book.json inside bookPath.

        NOTE(review): mutates book.__dict__ in place while serializing
        (publishedOn stringified, authors replaced by plain dicts,
        blank isbn_13 set to None). Errors are logged, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            # Normalize blank ISBN to None so the json contains null.
            if not row2dict['isbn_13'] == None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' Report whether a download appears to still be in progress.

        Samples the sizes of '.part' files and compares them across
        iterations.

        NOTE(review): the while loop exits after a single pass, so
        i never exceeds 4 and the size-comparison branch is dead code;
        as written the method always returns True.
        '''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: iterate listing pages starting at page 1100.

        NOTE(review): loops forever by design; also constructs a fresh
        ItEbook rather than reusing self.
        '''
        #         baseUrl = 'http://itebooks.website'
        #         baseUrl = 'http://it-ebooks.directory'
        baseUrl = 'http://www.ebook777.com'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 1100
        while logicTrue:
            subUrl = 'page/' + str(i) + '/'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)


#             if i==4:
#                 break

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 when empty.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        Create (if needed) and return the next numbered book directory.

        Directory name is database maxId + 1. Side effect: chdir()s
        into the directory when it is newly created.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name

    def extractRar(self, directory_name):
        '''
        Extract every .rar archive found directly in directory_name,
        skipping html/htm/txt members.
        '''
        os.chdir(directory_name)
        #         directory_name = '/docs/new/library/8006'
        listOfFiles = [
            name for name in os.listdir(directory_name)
            if not os.path.isdir(os.path.join(directory_name, name))
        ]
        for fileName in listOfFiles:
            if fileName.endswith(".rar"):
                #                 print fileName
                # NOTE(review): bare expression below has no effect;
                # likely leftover from debugging.
                directory_name
                rar = rarfile.RarFile(os.path.join(directory_name, fileName))
                #                 print rar.namelist()
                infoList = rar.infolist()
                nameList = rar.namelist()
                for name in nameList:
                    if not ((name.endswith('.html')) or
                            (name.endswith('.htm')) or
                            (name.endswith('.txt'))):
                        rar.extract(name, directory_name)
        pass
# Example 2
# 0
class ItEbook(object):
    '''
    Scraper for an it-ebooks-style site.

    Walks paginated listing pages, builds Book objects from the detail
    pages' schema.org itemprop markup, downloads covers and books via
    a Selenium-driven browser, and refreshes the database.

    NOTE(review): Python 2 code (print statements, StringIO import).
    '''
    def __init__(self, baseUrl=None):
        '''
        Constructor.

        baseUrl -- root URL of the site to scrape,
                   e.g. 'http://itebooks.website'
        '''
        self.baseUrl = baseUrl
        # Library root; a numbered sub-directory is created per book.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Browser-like User-Agent: some sites reject the default
        # python-requests agent.
        self.header_info = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }
        pass

    def getUrl(self, baseUrl):
        '''Return the configured base URL.

        NOTE(review): the baseUrl argument is ignored.
        '''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        Scrape one listing page and download every unknown book on it.

        subUrl -- page path relative to baseUrl,
                  e.g. 'page-2.html' (http://itebooks.website/page-2.html)
        '''
        url = self.baseUrl + '/' + subUrl
        print url
        #         content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            # Navigation/branding anchor texts that are not book titles.
            skipList = (u'\nCategories', u'\nContact', u'\nUpload',
                        u'\nDonate', u'IT eBooks', u'Prev', u'Next')
            listOfBookName = list()
            for link in soup.find_all('a'):
                if link.text.strip() != '' and link.text not in skipList:
                    listOfBookName.append(link.text)

                    # Skip books already stored (looked up by name).
                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    if not isBookAvailable:
                        print link.text, '\t', link.get(
                            'href'), isBookAvailable
                        book = self.findBookDetail(link.get('href'))
                        #                     print book
                        try:
                            print 'uploading database'
                            self.firefoxDownloadJob(book, link.get('href'))
                            self.updateDatabase()
                        except:
                            # Best effort: log the failing link and
                            # continue with the next one.
                            print link.get('href')
                            traceback.print_exc()

    def updateDatabase(self):
        '''Recreate the database schema and reload all book data.'''
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''Return True when a book with this exact name already exists.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        ''' Fetch a detail page and build a Book from its schema.org
        itemprop markup (author, isbn, name, publisher, dates, pages,
        language, format, description, cover image, file size).

        Returns the Book, or None when the page is not HTTP 200.
        NOTE(review): all lookups are index-based ([0], [1], [3],
        fixed row/cell positions) and therefore tightly coupled to the
        site's exact layout.
         '''
        book = None
        url = self.baseUrl + '/' + subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")

            book = Book()
            book.authors.append(
                Author(soup.find_all(itemprop="author")[0].text))
            book.isbn_10 = soup.find_all(itemprop="isbn")[0].text
            book.isbn_13 = soup.find_all(itemprop="isbn")[1].text
            book.bookName = soup.find_all(itemprop="name")[0].text
            book.publisher = soup.find_all(itemprop="publisher")[0].text

            # Fall back to "now" when the publication year is missing
            # or malformed.
            try:
                date = datetime.strptime(
                    str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
            except:
                date = datetime.now()
            book.publishedOn = date

            book.numberOfPages = soup.find_all(
                itemprop="numberOfPages")[0].text
            book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
            book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
            book.bookDescription = soup.find_all("div",
                                                 {"class": "span12"})[3].text
            print soup.find_all(itemprop="image")
            book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
            try:
                book.subTitle = soup.find_all("div",
                                              {"class": "span12"})[1].text
            except:
                # Subtitle is optional; log and continue without it.
                traceback.print_exc()

#             book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all('td')[1].find_all('b')[0].text
            book.fileSize = soup.find_all(
                'table', {"class": "table table-bordered"
                          })[1].find_all('tr')[5].find_all('td')[1].text

    #         book.fileSize=

    #         .top > div:nth-child(2) > h3:nth-child(2)

#             for link in soup.find_all('a'):
#                 if link.get('href').startswith('http://filepi.com'):
#                     book.name = link.text
#                     break
        return book

    def firefoxDownloadJob(self, book, refUrl):
        '''Download a book by driving a browser.

        Cleans the target directory, saves cover + json, opens the
        detail page, triggers the 'download' element, then polls until
        no '.part' file remains in the directory.

        NOTE(review): a FirefoxProfile is configured here but the code
        actually launches webdriver.Chrome(), so none of the profile
        preferences take effect.
        '''
        # Creating directory
        directory_name = self.downloadDir()

        # Creating Actual URL
        url = self.baseUrl + refUrl

        lsFiles = []

        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            imageUrl = self.baseUrl + book.bookImgName
            subUrl = book.bookImgName
            imageFileName = subUrl.split('/')[-1:][0]

            # Downloading book cover
            bookImagePath = os.path.join(directory_name,
                                         subUrl.split('/')[-1:][0])
            #             urllib.urlretrieve(imageUrl,bookImagePath)
            # NOTE(review): PIL/StringIO imports are unused here.
            from PIL import Image
            from StringIO import StringIO
            r = requests.get(imageUrl, headers=self.header_info, timeout=30)
            print '--------------->', r.url
            with open(bookImagePath, 'wb') as imageFile:
                imageFile.write(r.content)

            book.bookImgName = imageFileName
            #writing json file
            self.writeJsonToDir(directory_name, book)

            fp = webdriver.FirefoxProfile()

            fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
            # folderList=2: download into the custom dir set below.
            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              True)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              True)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", True)
            fp.set_preference("browser.download.manager.useWindow", True)
            fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                              "application/octet-stream")
            fp.update_preferences()
            driver = webdriver.Chrome()
            # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
            driver.get(url)
            efd_link = driver.find_element_by_id(id_='download')
            #             efd_link.click()
            efd_link.send_keys(Keys.RETURN)
            # Poll every 10s until the browser's temporary '.part'
            # file disappears, i.e. the download has finished.
            flag = True
            while (flag):
                # # checking part file
                time.sleep(10)
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
                print lst
                if 'part' not in lst:
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    #print files
                    #                     if not self.isBookDownloading(files):
                    #                         driver.close()
                    pass

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the Book object as book.json inside bookPath.

        NOTE(review): mutates book.__dict__ in place while serializing
        (publishedOn stringified, authors replaced by plain dicts).
        Errors are logged, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' Report whether a download appears to still be in progress.

        Samples the sizes of '.part' files and compares them across
        iterations.

        NOTE(review): the while loop exits after a single pass, so
        i never exceeds 4 and the size-comparison branch is dead code;
        as written the method always returns True.
        '''
        #time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: iterate listing pages starting at page 2.

        NOTE(review): loops forever by design; also constructs a fresh
        ItEbook rather than reusing self.
        '''
        baseUrl = 'http://itebooks.website'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 2
        while logicTrue:
            subUrl = 'page-' + str(i) + '.html'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)


#             if i==4:
#                 break

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 when empty.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        Create (if needed) and return the next numbered book directory.

        Directory name is database maxId + 1. Side effect: chdir()s
        into the directory when it is newly created.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name
# Example 3
# 0
class ItEbook(object):
    '''
    Scraper for http://it-ebooks.info: finds book links on the first
    page, builds Book metadata from each book page, then downloads the
    cover, a book.json metadata file and the ebook itself (via a
    Selenium-driven Firefox) into a numbered library sub-directory and
    registers it in the local database.
    '''
    def __init__(self, baseUrl=None):
        '''
        Constructor.

        @param baseUrl: root URL of the site, e.g. 'http://it-ebooks.info'.
        '''
        self.baseUrl = baseUrl
        # Root folder of the local library; every book gets its own
        # numbered sub-directory beneath it.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        pass

    def getUrl(self, baseUrl):
        '''Return the configured base URL (the *baseUrl* argument is ignored).'''
        return self.baseUrl

    def findAllBookUrl(self):
        '''
        Scrape the base page and process every anchor whose text is
        non-empty and not in the navigation skip list; each book not yet
        present in the database is downloaded and registered.
        '''
        content = urllib2.urlopen(self.baseUrl).read()
        soup = BeautifulSoup(content, "lxml")

        # Site navigation / branding anchors that are not books.
        skipList = ('IT eBooks', 'IT eBooks Group', u'IT-eBooks.Info',
                    u'IT-eBooks API', u'IT-eBooks Search', 'Tweet')
        listOfBookName = list()
        for link in soup.find_all('a'):
            if link.text.strip() != '' and link.text not in skipList:
                listOfBookName.append(link.text)
                # Skip books that are already recorded in the database.
                isBookAvailable = self.isBookNameAvailableInDatabase(link.text)
                if not isBookAvailable:
                    print link.text, '\t', link.get('href'), isBookAvailable
                    book = self.findBookDetail(link.get('href'))
                    #                     print book
                    try:
                        self.firefoxDownloadJob(book, link.get('href'))
                        self.updateDatabase()
                    except:
                        # Best effort: log the failing link and continue
                        # with the remaining books.
                        print link.get('href')
                        traceback.print_exc()

    def updateDatabase(self):
        '''Create the database schema if needed and import the new book data.'''
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''Return True when a book named *bookName* already exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, number):
        '''
        Fetch the book page at baseUrl + *number* (a relative href) and
        build a Book object from its schema.org itemprop markup; also
        resolves the download link text into book.name.

        @param number: relative href of the book page.
        @return: populated Book instance.
        '''

        url = self.baseUrl + number
        content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(content, "lxml")
        book = Book()
        book.authors.append(Author(soup.find_all(itemprop="author")[0].text))
        book.isbn_13 = soup.find_all(itemprop="isbn")[0].text
        book.bookName = soup.find_all(itemprop="name")[0].text
        book.publisher = soup.find_all(itemprop="publisher")[0].text

        try:
            # The page only exposes the publication year, e.g. '2015'.
            date = datetime.strptime(
                str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
        except:
            # Fall back to "now" when the year is missing or malformed.
            date = datetime.now()
        book.publishedOn = date

        book.numberOfPages = soup.find_all(itemprop="numberOfPages")[0].text
        book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
        book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
        book.bookDescription = soup.find_all(itemprop="description")[0].text
        book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
        try:
            # NOTE(review): assumes the first <h3> is the subtitle -- confirm.
            book.subTitle = soup.h3.text
        except:
            traceback.print_exc()
        # File size lives in a fixed cell of the 4th table on the page.
        book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all(
            'td')[1].find_all('b')[0].text
        #         book.fileSize=

        #         .top > div:nth-child(2) > h3:nth-child(2)

        # The anchor pointing at filepi.com is the actual download link;
        # its text is the file name that Firefox will save.
        for link in soup.find_all('a'):
            if link.get('href').startswith('http://filepi.com'):
                book.name = link.text
                break
        return book

    def firefoxDownloadJob(self, book, refUrl):
        '''
        Download the ebook behind *refUrl* with a Selenium-driven Firefox
        configured to auto-save application/octet-stream files into the
        book's library directory; polls until no .part file remains.
        '''
        # Creating directory
        directory_name = self.downloadDir()

        # Creating Actual URL
        url = self.baseUrl + refUrl

        lsFiles = []

        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from directory.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            imageUrl = self.baseUrl + book.bookImgName
            subUrl = book.bookImgName
            imageFileName = subUrl.split('/')[-1:][0]

            # Downloading book cover
            bookImagePath = os.path.join(directory_name,
                                         subUrl.split('/')[-1:][0])
            urllib.urlretrieve(imageUrl, bookImagePath)
            book.bookImgName = imageFileName
            #writing json file
            self.writeJsonToDir(directory_name, book)
            # NOTE(review): hard-coded local Firefox binary path.
            binary = FirefoxBinary('/docs/python_projects/firefox/firefox')

            fp = webdriver.FirefoxProfile()

            # Auto-download octet-stream files into directory_name without
            # any save-dialog interaction.
            fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              False)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              False)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", False)
            fp.set_preference("browser.download.manager.useWindow", False)
            fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                              "application/octet-stream")
            fp.update_preferences()
            driver = webdriver.Firefox(firefox_profile=fp,
                                       firefox_binary=binary)
            # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
            driver.get(url)
            efd_link = driver.find_element_by_link_text(book.name)
            # Size and format are read from fixed cells of the page table.
            book.fileSize = driver.find_element_by_xpath(
                "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[8]/td[2]/b"
            ).text
            book.bookFormat = driver.find_element_by_xpath(
                "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[9]/td[2]/b"
            ).text
            efd_link.click()
            flag = True
            while (flag):
                # # checking part file
                time.sleep(10)
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
                print lst
                # Firefox keeps a .part file while a download is active;
                # once no .part extension is seen the download is done.
                if 'part' not in lst:
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    #print files
                    #                     if not self.isBookDownloading(files):
                    #                         driver.close()
                    pass

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialise *book* to <bookPath>/book.json: a datetime publishedOn
        becomes a string and each author becomes a dict. Errors are
        logged (traceback) and swallowed.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        '''
        Return a flag telling whether the book download still appears to
        be in progress, based on recorded .part file sizes.

        NOTE(review): the while loop runs exactly once (checkFlagForSize
        is cleared on the first pass), so i never exceeds 4 and the size
        comparison is unreachable -- as written this always returns True.
        '''
        #time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])


#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: scrape the first page of it-ebooks.info.'''
        baseUrl = 'http://it-ebooks.info'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        itebook.findAllBookUrl()

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 when empty.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        Create (if missing) and return the download directory for the
        next book: <library>/<maxBookId + 1>. Also chdir()s into a
        newly created directory.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name
# Example #4 (vote count: 0)
class FindingBook():
    '''
    Searches book details in the Opal database created in the workspace
    (Opal library) and offers maintenance helpers (delete, orphan scan).
    '''

    def __init__(self, libraryPath=None):
        '''
        @param libraryPath: path of the Opal library (workspace) directory.
        '''
        self.libraryPath = libraryPath
        self.createDatabase = CreateDatabase(libraryPath=libraryPath)

    def searchingBook(self, searchText=None, exactSearchFlag=False, pageSize=10, offset=0):
        '''
        Return (books, count) matching *searchText*.

        @param searchText: book name (or fragment); None/'' -> all books.
        @param exactSearchFlag: True for an exact name match.
        @param pageSize: maximum number of rows for a fuzzy search.
        @param offset: pagination offset for a fuzzy search.
        '''
        books = list()
        if searchText is not None and searchText != '':
            os.chdir(self.libraryPath)
            if exactSearchFlag:
                books, count = self.createDatabase.findByBookName(searchText)
            else:
                # BUGFIX: the caller-supplied offset was previously ignored
                # (hard-coded offset=0), which broke pagination.
                books, count = self.createDatabase.findBySimlarBookName(
                    bookName=searchText, limit=pageSize, offset=offset)
        else:
            books, count = self.findAllBooks()
        return books, count

    def countAllBooks(self):
        '''Return the total number of books in the library database.'''
        return self.createDatabase.countAllBooks()

    def findBookByNextMaxId(self, bookId=None):
        '''Delegate to the database layer (book following *bookId*).'''
        return self.createDatabase.findBookByNextMaxId(bookId)

    def findBookByPreviousMaxId(self, bookId=None):
        '''Delegate to the database layer (book preceding *bookId*).'''
        return self.createDatabase.findBookByPreviousMaxId(bookId)

    def findAllBooks(self, pageSize=None, offset=0):
        '''
        Return (books, count) for every book in the library, optionally
        paginated through *pageSize*/*offset*.
        '''
        os.chdir(self.libraryPath)
        books, count = self.createDatabase.findAllBook(pageSize=pageSize, offset=offset)
        return books, count

    def findBookByIsbn(self, isbn_13):
        '''Return book(s) matching the given ISBN-13.'''
        return self.createDatabase.findBookByIsbn(isbn_13)

    def getMaxBookId(self):
        # NOTE(review): only switches to the library directory and returns
        # None -- looks unfinished; confirm the intended behaviour.
        os.chdir(self.libraryPath)

    def deleteBook(self, book):
        '''
        Remove *book* from the database; on success also delete its files.

        @param book: book object
        '''
        bookPath = book.bookPath
        isSuccessfulDatabaseDelete = self.createDatabase.removeBook(book)
        if isSuccessfulDatabaseDelete:
            BookTerminal().removeBook(bookPath=bookPath)

    def findFolderWithoutBook(self):
        '''
        Scan every numbered folder in the library and return the list of
        folders holding fewer than three files (i.e. incomplete books).
        '''
        directory_name = self.libraryPath
        os.chdir(directory_name)
        listOfDir = [name for name in os.listdir(directory_name)
                     if os.path.isdir(os.path.join(directory_name, name))]
        if listOfDir:
            listOfDir.sort(key=int)  # folder names are numeric book ids
        defaulterList = list()
        for folderName in listOfDir:
            lst = list()
            levelOne = os.path.join(directory_name, folderName)
            for sName in os.listdir(levelOne):
                if os.path.isfile(os.path.join(levelOne, sName)):
                    lst.append(sName.split('.')[-1:][0])
            # A complete book folder holds 3 files (ebook, cover, json).
            if len(lst) < 3:
                defaulterList.append(levelOne)
        # BUGFIX: the computed list was previously built but never returned.
        return defaulterList
# Example #5 (vote count: 0)
class FullCircleMagazine():
    
    def __init__(self, baseUrl=None):
        '''Set up scraper state; *baseUrl* is the magazine site root.'''
        self.baseUrl = baseUrl
        # Library root where each issue gets a numbered sub-directory.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Browser-like UA so the server treats us as a regular client.
        self.header_info = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0',
        }
        # Cover image / book URLs, filled in while downloading an issue.
        self.imageUrl = None
        self.bookUrl = None
    
    def downloadFullCircleMagazine(self, url, book=None, bookUrl=None):
        '''
        Download one issue PDF from *url* into a fresh numbered library
        directory, write book.json and the cover image next to it, and
        register the directory in the database on success.

        @param url: direct PDF link, e.g. http://dl.fullcirclemagazine.org/issue1_en.pdf
        @param book: pre-filled Book record for this issue.
        @param bookUrl: unused here (kept for backward compatibility).
        @return: (HTTP status code, download directory path)
        '''
#         url = 'http://dl.fullcirclemagazine.org/issue1_en.pdf'
#         'http://dl.fullcirclemagazine.org/issue3_en.pdf'
        directory_name = self.createDownloadDir()
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        os.chdir(directory_name)
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            print r.status_code, url
            print '------->', int(r.headers["content-length"]) / 1000000
            # NOTE(review): under Python 2 this is integer division, so the
            # rounded value has already lost its fractional megabytes.
            book.fileSize = str(round(int(r.headers["content-length"]) / 1000000 , 2)) + ' MB'
            self.writeJsonToDir(directory_name, book)
            # self.imageUrl is presumably a data: URI set by the caller --
            # see downloadBookImage; confirm against startDownload.
            self.downloadBookImage(bookImagePath, self.imageUrl)
#             r = requests.get(bookUrl, headers=self.header_info, timeout=30)
            print '--------------->', r.url
            bookPath = os.path.join(directory_name, url.split('/')[-1])
            print bookPath
            with open(bookPath, 'wb') as bookFile:

                bookFile.write(r.content)
            self.updateDatabase(directory_name)
        return r.status_code, directory_name
    
    def createBookDetail(self, bookName=None):
        '''Build a Book record for one magazine issue; *bookName* is e.g. 'Issue 7'.'''
        book = Book()
        # Identity
        book.bookName = "Full Circle " + bookName
        book.subTitle = 'Magazine'
        book.publisher = "Full Circle"
        # Classification
        book.bookFormat = 'pdf'
        book.tag = 'Technology'
        book.inLanguage = 'English'
        # Assets / flags
        book.bookImgName = bookName + '.jpg'
        book.hasCover = 'Yes'
        book.hasCode = 'No'
        return book
            
    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialise *book* to <bookPath>/book.json.

        Converts a datetime 'publishedOn' to a string, normalises each
        entry of 'authors' to a dict, and nulls-out an all-whitespace
        ISBN. Errors are logged (traceback) and swallowed, as before.
        '''
        try:
            row2dict = book.__dict__
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            authors = []
            for a in row2dict['authors']:
                if type(a) == str:
                    author = {'authorName': a}
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            # An all-whitespace ISBN is as good as missing.
            if row2dict['isbn_13'] is not None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            # BUGFIX: 'with' guarantees the handle is closed even when
            # serialisation fails (the old code leaked the open file).
            with open(os.path.join(bookPath, 'book.json'), 'w') as f:
                f.write(json.dumps(row2dict, sort_keys=False, indent=4))
        except Exception:
            traceback.print_exc()
            
    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        Decode *imageUrl* -- expected to be a data: URI of the form
        'data:<mediatype>[;charset=...][;base64],<payload>' -- and write
        the binary payload to *bookImagePath*.
        '''
        print imageUrl
        # Split the URI into its header and payload parts.
        head, data = imageUrl.split(',', 1)
        bits = head.split(';')
        mime_type = bits[0] if bits[0] else 'text/plain'
        charset, b64 = 'ASCII', False
        for bit in bits:
            if bit.startswith('charset='):
                charset = bit[8:]
            elif bit == 'base64':
                b64 = True

        # Do something smart with charset and b64 instead of assuming
        # NOTE(review): the payload is always base64-decoded even when the
        # 'base64' flag was absent (b64 is computed but never used); this
        # str.decode("base64") form is Python-2-only.
        plaindata = data.decode("base64")

        # Do something smart with mime_type
        with open(bookImagePath, 'wb') as f:
            f.write(plaindata)

        print 'write image complete'
#         from PIL import Image   
#         from StringIO import StringIO
#         r = requests.get(imageUrl, headers=self.header_info, timeout=30)
#         print '--------------->', r.url
#         with open(bookImagePath, 'wb') as imageFile:
#             imageFile.write(r.content)    


    def updateDatabase(self, directory_name):
        '''Register the freshly downloaded issue in *directory_name* with the database.'''
        self.createDatabase.addSingleBookData(directory_name)
           
    def isIsbnAvailableInDatabase(self, isbn_13=None):
        '''True when the database already holds a book with this ISBN-13.'''
        return bool(self.createDatabase.findByIsbn_13Name(isbn_13))
    
    def isBookNameAvailableInDatabase(self, bookName=None):
        '''True when the database already holds a book called *bookName*.'''
        return bool(self.createDatabase.findByBookName(bookName))
    
    def createDownloadDir(self):
        '''
        Create (if needed) and return the download directory for the next
        book: <library>/<maxBookId + 1>. Also chdir()s into a newly
        created directory, as the downloader relies on the CWD.
        '''
        directory_name = os.path.join(self.directory_name, str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            # BUGFIX: the mode was the decimal literal 755 (= 0o1363,
            # nonsense permissions); 0o755 gives the intended rwxr-xr-x.
            os.makedirs(directory_name, 0o755)
            os.chdir(directory_name)
        return directory_name
    
    def getMaxBookID(self):
        '''Highest book id in the database, or 0 when it is empty.'''
        return self.createDatabase.getMaxBookID() or 0
    
    
    def getImageUrl(self, completeUrl, issueCount):
        '''
        Fetch the issue page *completeUrl* and return the cover <img> src
        for English issue *issueCount*; None when the request fails or
        the alt text does not match the expected English cover.
        '''
        print completeUrl
        imageUrl = None
        r = requests.get(completeUrl, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
#             print soup
            # The cover is the first <img> of the 'issuetable' block; the
            # alt text confirms it is the English-language cover.
            alt = soup.find(class_='issuetable').find('img')['alt']
            if alt == 'Cover for Issue '+issueCount+' in English':
                imageUrl = soup.find(class_='issuetable').find('img')['src']
                print imageUrl
        return imageUrl
    
    def startDownload(self):
        '''
        Walk issues 1, 2, 3, ... downloading every issue not yet in the
        library; stops at the first non-200 response for a new issue.
        '''
        logic = True
        i = 1
        while logic:
            pdfUrl = 'http://dl.fullcirclemagazine.org/issue' + str(i) + '_en.pdf'
            completeUrl = 'http://fullcirclemagazine.org/issue-' + str(i) + '/'
            if not self.isIssuePresent(str(i)):
                self.imageUrl = self.getImageUrl(completeUrl,str(i))
                book = self.createBookDetail('Issue ' + str(i))
                status_code, directory_name = self.downloadFullCircleMagazine(book=book, url=pdfUrl)
                print completeUrl, status_code
                if status_code != 200:
                    # No more issues available -- stop the crawl.
                    logic = False
            i = i + 1
    
    
    def isIssuePresent(self, issue=None):
        '''True when issue number *issue* (a string) is already in the database.'''
        return bool(self.createDatabase.findByBookName("Full Circle Issue " + issue))

    def getIssueDetail(self):
        url='https://wiki.ubuntu.com/UbuntuMagazine/FullIssueIndex'
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml") 
            tables=soup.findAll('table')
            for table in tables:
                print table