class ItEbook(object):
    '''
    Scraper/downloader for books listed on www.ebook777.com.

    Workflow (see startDownload): iterate listing pages, collect book links,
    scrape each book's detail page into a Book object, download the cover,
    metadata JSON and the book archive into a fresh numbered directory under
    the workspace library path, then register it in the database.

    NOTE(review): Python 2 code (print statements, StringIO).
    '''

    def __init__(self, baseUrl=None):
        '''
        Constructor.

        @param baseUrl: site root, e.g. 'http://www.ebook777.com'
        '''
        self.baseUrl = baseUrl
        # Library root where per-book directories are created.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Browser-like User-Agent so the site serves normal pages.
        self.header_info = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }
        # book image url — set as a side effect of findBookDetail, read by downloadEbook.
        self.imageUrl = None
        self.bookUrl = None
        pass

    def getUrl(self, baseUrl):
        '''Return the stored base URL (the argument is ignored).'''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        Scrape one listing page and process every book link found on it.

        For each anchor with class 'title' that is not a navigation/category
        label (skipList), check the database by name and by ISBN; if the book
        is unknown, download it and add it to the database.

        @param subUrl: page suffix appended to baseUrl, e.g. 'page/1100/'
        '''
        url = self.baseUrl + '/' + subUrl
        # print url
        # content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            # Anchor texts that are site navigation / category links, not books.
            skipList = [
                'HOME', 'Category', 'Animals', 'Architecture', 'Art',
                'Astronomy', 'Biography', 'Biology', 'Business', 'Chemistry',
                'Cinema', 'Cookbooks', 'Cryptography', 'Culture', 'Design',
                'Drawing', 'Economics', 'Encyclopedia and Dictionary',
                'Engineering and Technology', 'Family and Friendship',
                'Fitness', 'Gambling', 'Games', 'Hardware', 'Healthcare',
                'History', 'Hobbies', 'Information Technologies', 'IT ebooks',
                'Languages', 'Martial Arts', 'Mathematics', 'Medicine',
                'Military', 'Music', 'Novels', 'Other', 'Personality',
                'Philosophy', 'Photo', 'Physics', 'Poetry',
                'Politics and Sociology', 'Programming', 'Psychology',
                'Relationships', 'Religion', 'Science', 'Security',
                'Sexuality', 'Software', 'Sport', 'Travel', 'Web Development'
            ]
            # with open(os.path.dirname(__file__) + os.sep + 'skipList.txt', 'r') as f:
            #     for line in f:
            #         skipList.append(line.rstrip('\n'))
            #     f.close
            listOfBookName = list()
            for link in soup.find_all('a', 'title'):
                if link.text.strip() != '' and link.text not in skipList:
                    listOfBookName.append(link.text)
                    # First cheap check: do we already have this title?
                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    # self.isIsbnAvailableInDatabase()
                    # print isBookAvailable, link.text
                    if not isBookAvailable:
                        # print link.text, '\t', link.get('href'), isBookAvailable
                        # Scrape the detail page; also yields the download URL.
                        book, bookUrl = self.findBookDetail(link.get('href'))
                        # Second check: same book may exist under another title.
                        isBookAvailable = self.isIsbnAvailableInDatabase(
                            book.isbn_13)
                        # print book
                        if not isBookAvailable:
                            try:
                                print 'uploading database'
                                directory_name = self.downloadEbook(
                                    book, link.get('href'), bookUrl)
                                self.updateDatabase(directory_name)
                            except:
                                # Best-effort: log the failing link and continue
                                # with the next book on the page.
                                print link.get('href')
                                traceback.print_exc()

    def updateDatabase(self, directory_name):
        '''Register the single downloaded book directory in the database.'''
        # self.createDatabase.creatingDatabase()
        # self.createDatabase.addingData()
        self.createDatabase.addSingleBookData(directory_name)

    def isIsbnAvailableInDatabase(self, isbn_13=None):
        '''Return True if a book with this ISBN-13 exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''Return True if a book with this exact name exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        '''
        Scrape a book detail page into a Book object.

        Example page: http://www.ebook777.com/shut-youre-welcome/

        Side effect: stores the cover image URL in self.imageUrl.

        @param subUrl: absolute URL of the book page (used as-is).
        @return: (book, bookUrl) where bookUrl is the first download link.
                 NOTE(review): if the HTTP status is not 200, book stays None
                 and bookUrl would be unbound — presumably never hit in
                 practice; verify.
        '''
        book = None
        # url=self.baseUrl+'/'+subUrl
        url = subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            book = Book()
            book.bookDescription = soup.find(id="main-content-inner").p.text
            book.bookName = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='title').text
            book.subTitle = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='subtitle').text
            bookUrl = soup.find(id="main-content-inner").find(
                class_='download-links').find('a')['href']
            # Metadata table: 3-column rows carry the cover+author,
            # 2-column rows carry label/value pairs.
            table_body = soup.find('table')
            rows = table_body.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) == 3:
                    book.bookImgName = cols[0].img.attrs['alt']
                    self.imageUrl = cols[0].img.attrs['src']
                    if cols[1].text == 'Author':
                        # print cols[2].text
                        author = Author()
                        author.authorName = cols[2].text
                        book.authors.append(author)
                        # book.authors.append()
                if len(cols) == 2:
                    if cols[0].text == 'File size':
                        book.fileSize = cols[1].text
                    if cols[0].text == 'Year':
                        # Year is just 'YYYY'; fall back to now() on parse failure.
                        try:
                            date = datetime.strptime(cols[1].text, '%Y')
                        except:
                            date = datetime.now()
                        book.publishedOn = date
                    if cols[0].text == 'Pages':
                        book.numberOfPages = cols[1].text
                    if cols[0].text == 'Language':
                        book.inLanguage = cols[1].text
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Category':
                        book.tag = cols[1].text
                    # NOTE(review): duplicate 'File format' branch below —
                    # harmless repeat of the assignment above.
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Isbn':
                        book.isbn_13 = cols[1].text
            # print cols
        return book, bookUrl

    def downloadEbook(self, book, refUrl, bookUrl):
        '''
        Download cover, metadata JSON and the book file into a new directory.

        @param book: populated Book object (bookImgName must be set).
        @param refUrl: book page URL (currently unused beyond assignment).
        @param bookUrl: direct download link for the book archive.
        @return: path of the created per-book directory.
        '''
        directory_name = self.downloadDir()
        url = refUrl
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        # self.imageUrl was set by findBookDetail for this same book.
        self.downloadBookImage(bookImagePath, self.imageUrl)
        self.writeJsonToDir(directory_name, book)
        r = requests.get(bookUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        # File name taken from the last URL path segment.
        bookPath = os.path.join(directory_name, bookUrl.split('/')[-1])
        with open(bookPath, 'wb') as bookFile:
            bookFile.write(r.content)
        # Extraction is best-effort: the download may not be a rar archive.
        try:
            self.extractRar(directory_name)
        except:
            traceback.print_exc()
            pass
        return directory_name

    def firefoxDownloadJob(self, book, refUrl):
        '''
        Download a book via a real Firefox instance driven by selenium.

        Configures the profile to auto-save known MIME types into the book
        directory, clicks the first download link, then polls until no
        '.part' (in-progress) files remain.

        @param book: populated Book object.
        @param refUrl: book page URL to open in the browser.
        '''
        # Creating directory
        directory_name = self.downloadDir()
        # Creating Actual URL
        # url = self.baseUrl+refUrl
        url = refUrl
        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)
        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)
        # Downloading book cover
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        self.downloadBookImage(bookImagePath, self.imageUrl)
        # writing json file
        self.writeJsonToDir(directory_name, book)
        # Hard-coded local Firefox binary — environment specific.
        binary = FirefoxBinary('/docs/python_projects/firefox/firefox')
        fp = webdriver.FirefoxProfile()
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
        # folderList=2 => use the custom browser.download.dir below.
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
        # fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
        # MIME types that are saved without the download dialog.
        fp.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed"
        )
        fp.set_preference("browser.helperApps.alwaysAsk.force", False)
        fp.set_preference("browser.popups.showPopupBlocker", False)
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(url)
        efd_link = driver.find_element_by_css_selector(
            ".download-links > a:nth-child(1)")
        efd_link.click()
        # efd_link.send_keys(Keys.RETURN)
        # Poll every 10s: Firefox keeps a '.part' file while downloading.
        flag = True
        while (flag):
            # # checking part file
            time.sleep(10)
            lst = []
            files = []
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lst.append(sName.split('.')[-1:][0])
                    files.append(os.path.join(directory_name, sName))
            # print lst
            if 'part' not in lst:
                flag = False
                time.sleep(10)
                driver.close()
            else:
                # print files
                # if not self.isBookDownloading(files):
                #     driver.close()
                pass
        self.extractRar(directory_name)

    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        Download the cover image from imageUrl and save it at bookImagePath.
        '''
        # NOTE(review): PIL/StringIO imports appear unused in this body.
        from PIL import Image
        from StringIO import StringIO
        r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        with open(bookImagePath, 'wb') as imageFile:
            imageFile.write(r.content)

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the Book object to <bookPath>/book.json.

        Converts datetime publishedOn to a string, author objects to dicts,
        and normalizes a blank isbn_13 to None. Errors are logged, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            # NOTE(review): mutates book.__dict__ in place (authors,
            # publishedOn are overwritten with serializable forms).
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            if not row2dict['isbn_13'] == None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        '''
        Heuristic check whether a download is still in progress: samples the
        sizes of '.part' files and reports False once sizes stop being equal.
        NOTE(review): dic_files is filled once before the loop, so all
        samples see the same sizes — presumably intended to re-stat each
        iteration; verify before relying on this method.
        '''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):
            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
                # print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
                checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''
        Entry point: crawl ebook777 listing pages starting at page 1100 and
        download every unseen book. Loops indefinitely (logicTrue never set
        False) — stops only via an unhandled error or external interrupt.
        '''
        # baseUrl = 'http://itebooks.website'
        # baseUrl = 'http://it-ebooks.directory'
        baseUrl = 'http://www.ebook777.com'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 1100
        while logicTrue:
            subUrl = 'page/' + str(i) + '/'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)
            # if i==4:
            #     break

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 if empty.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        Create (if needed) and chdir into the next per-book directory,
        named <libraryPath>/<maxBookId + 1>.

        @return: the directory path.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        os.chdir(directory_name)
        return directory_name

    def extractRar(self, directory_name):
        '''
        Extract every .rar archive in directory_name, skipping html/htm/txt
        members. Requires the 'rarfile' module (and an unrar backend).
        '''
        os.chdir(directory_name)
        # directory_name = '/docs/new/library/8006'
        listOfFiles = [
            name for name in os.listdir(directory_name)
            if not os.path.isdir(os.path.join(directory_name, name))
        ]
        for fileName in listOfFiles:
            if fileName.endswith(".rar"):
                # print fileName, directory_name
                rar = rarfile.RarFile(os.path.join(directory_name, fileName))
                # print rar.namelist()
                infoList = rar.infolist()
                nameList = rar.namelist()
                for name in nameList:
                    # Keep only the actual book files, not bundled docs.
                    if not ((name.endswith('.html')) or (name.endswith('.htm'))
                            or (name.endswith('.txt'))):
                        rar.extract(name, directory_name)
                pass
class ItEbook(object):
    '''
    Scraper/downloader for books listed on itebooks.website.

    Variant of the ebook777 scraper: book metadata is read from itemprop
    attributes and the actual file is fetched by driving a browser with
    selenium. NOTE(review): Python 2 code.
    '''

    def __init__(self, baseUrl=None):
        '''
        Constructor.

        @param baseUrl: site root, e.g. 'http://itebooks.website'
        '''
        self.baseUrl = baseUrl
        # Library root where per-book directories are created.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Browser-like User-Agent so the site serves normal pages.
        self.header_info = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }
        pass

    def getUrl(self, baseUrl):
        '''Return the stored base URL (the argument is ignored).'''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        Scrape one listing page (e.g. http://itebooks.website/page-2.html)
        and download every book not already present in the database.

        @param subUrl: page suffix appended to baseUrl, e.g. 'page-2.html'
        '''
        url = self.baseUrl + '/' + subUrl
        print url
        # content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            # Anchor texts that are site navigation, not book titles.
            skipList = (u'\nCategories', u'\nContact', u'\nUpload',
                        u'\nDonate', u'IT eBooks', u'Prev', u'Next')
            listOfBookName = list()
            for link in soup.find_all('a'):
                if link.text.strip() != '' and link.text not in skipList:
                    listOfBookName.append(link.text)
                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    if not isBookAvailable:
                        print link.text, '\t', link.get(
                            'href'), isBookAvailable
                        book = self.findBookDetail(link.get('href'))
                        # print book
                        try:
                            print 'uploading database'
                            self.firefoxDownloadJob(book, link.get('href'))
                            self.updateDatabase()
                        except:
                            # Best-effort: log the failing link and continue.
                            print link.get('href')
                            traceback.print_exc()

    def updateDatabase(self):
        '''Rebuild/refresh the database from the library contents.'''
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''Return True if a book with this exact name exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        '''
        Scrape a book detail page (itemprop-annotated markup) into a Book.

        @param subUrl: page path appended to baseUrl.
        @return: Book object, or None when the page is not reachable (200).
        '''
        book = None
        url = self.baseUrl + '/' + subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            book = Book()
            book.authors.append(
                Author(soup.find_all(itemprop="author")[0].text))
            # Page carries both ISBN-10 and ISBN-13, in that order.
            book.isbn_10 = soup.find_all(itemprop="isbn")[0].text
            book.isbn_13 = soup.find_all(itemprop="isbn")[1].text
            book.bookName = soup.find_all(itemprop="name")[0].text
            book.publisher = soup.find_all(itemprop="publisher")[0].text
            # Year is just 'YYYY'; fall back to now() on parse failure.
            try:
                date = datetime.strptime(
                    str(soup.find_all(itemprop="datePublished")[0].text),
                    '%Y')
            except:
                date = datetime.now()
            book.publishedOn = date
            book.numberOfPages = soup.find_all(
                itemprop="numberOfPages")[0].text
            book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
            book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
            # Positional scraping below is brittle — tied to current layout.
            book.bookDescription = soup.find_all("div",
                                                 {"class": "span12"})[3].text
            print soup.find_all(itemprop="image")
            book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
            try:
                book.subTitle = soup.find_all("div",
                                              {"class": "span12"})[1].text
            except:
                traceback.print_exc()
            # book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all('td')[1].find_all('b')[0].text
            book.fileSize = soup.find_all(
                'table', {"class": "table table-bordered"
                          })[1].find_all('tr')[5].find_all('td')[1].text
            # book.fileSize=
            # .top > div:nth-child(2) > h3:nth-child(2)
            # for link in soup.find_all('a'):
            #     if link.get('href').startswith('http://filepi.com'):
            #         book.name = link.text
            #         break
        return book

    def firefoxDownloadJob(self, book, refUrl):
        '''
        Download the book file by driving a browser through the page's
        'download' element, saving into a fresh per-book directory, and poll
        until no '.part' (in-progress) files remain.

        @param book: populated Book object (bookImgName is a site-relative
                     image path here).
        @param refUrl: book page path appended to baseUrl.
        '''
        # Creating directory
        directory_name = self.downloadDir()
        # Creating Actual URL
        url = self.baseUrl + refUrl
        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)
        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)
        imageUrl = self.baseUrl + book.bookImgName
        subUrl = book.bookImgName
        imageFileName = subUrl.split('/')[-1:][0]
        # Downloading book cover
        bookImagePath = os.path.join(directory_name,
                                     subUrl.split('/')[-1:][0])
        # urllib.urlretrieve(imageUrl,bookImagePath)
        # NOTE(review): PIL/StringIO imports appear unused here.
        from PIL import Image
        from StringIO import StringIO
        r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        with open(bookImagePath, 'wb') as imageFile:
            imageFile.write(r.content)
        # Replace the site path with the bare file name before serializing.
        book.bookImgName = imageFileName
        # writing json file
        self.writeJsonToDir(directory_name, book)
        fp = webdriver.FirefoxProfile()
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
        # folderList=2 => use the custom browser.download.dir below.
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', True)
        fp.set_preference('browser.download.manager.focusWhenStarting', True)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", True)
        fp.set_preference("browser.download.manager.useWindow", True)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "application/octet-stream")
        fp.update_preferences()
        # NOTE(review): a Firefox profile is configured above, but the driver
        # created is Chrome — the profile preferences are never used; confirm
        # which browser is intended.
        driver = webdriver.Chrome()
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(url)
        efd_link = driver.find_element_by_id(id_='download')
        # efd_link.click()
        efd_link.send_keys(Keys.RETURN)
        # Poll every 10s: the browser keeps a '.part' file while downloading.
        flag = True
        while (flag):
            # # checking part file
            time.sleep(10)
            lst = []
            files = []
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lst.append(sName.split('.')[-1:][0])
                    files.append(os.path.join(directory_name, sName))
            print lst
            if 'part' not in lst:
                flag = False
                time.sleep(10)
                driver.close()
            else:
                # print files
                # if not self.isBookDownloading(files):
                #     driver.close()
                pass

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the Book object to <bookPath>/book.json.

        Converts datetime publishedOn to a string and author objects to
        dicts. Errors are logged, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            # NOTE(review): mutates book.__dict__ in place.
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        '''
        Heuristic check whether a download is still in progress: samples the
        sizes of '.part' files and reports False once sizes differ.
        NOTE(review): dic_files is filled once before the loop, so all
        samples see identical sizes — verify before relying on this method.
        '''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):
            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
                # print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
                checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''
        Entry point: crawl itebooks.website listing pages starting at page 2.
        Loops indefinitely (logicTrue never set False).
        '''
        baseUrl = 'http://itebooks.website'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 2
        while logicTrue:
            subUrl = 'page-' + str(i) + '.html'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)
            # if i==4:
            #     break

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 if empty.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        Create (if needed) and chdir into the next per-book directory,
        named <libraryPath>/<maxBookId + 1>.

        @return: the directory path.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        os.chdir(directory_name)
        return directory_name
class ItEbook(object):
    '''
    Scraper/downloader for books listed on it-ebooks.info.

    Oldest variant: fetches pages with urllib2, scrapes itemprop-annotated
    metadata, and downloads the book file through a selenium-driven Firefox
    instance via the filepi.com link. NOTE(review): Python 2 code.
    '''

    def __init__(self, baseUrl=None):
        '''
        Constructor.

        @param baseUrl: site root, e.g. 'http://it-ebooks.info'
        '''
        self.baseUrl = baseUrl
        # Library root where per-book directories are created.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        pass

    def getUrl(self, baseUrl):
        '''Return the stored base URL (the argument is ignored).'''
        return self.baseUrl

    def findAllBookUrl(self):
        '''
        Scrape the base page and download every linked book not already
        present in the database.
        '''
        content = urllib2.urlopen(self.baseUrl).read()
        soup = BeautifulSoup(content, "lxml")
        # Anchor texts that are site navigation, not book titles.
        skipList = ('IT eBooks', 'IT eBooks Group', u'IT-eBooks.Info',
                    u'IT-eBooks API', u'IT-eBooks Search', 'Tweet')
        listOfBookName = list()
        for link in soup.find_all('a'):
            if link.text.strip() != '' and link.text not in skipList:
                listOfBookName.append(link.text)
                isBookAvailable = self.isBookNameAvailableInDatabase(link.text)
                if not isBookAvailable:
                    print link.text, '\t', link.get('href'), isBookAvailable
                    book = self.findBookDetail(link.get('href'))
                    # print book
                    try:
                        self.firefoxDownloadJob(book, link.get('href'))
                        self.updateDatabase()
                    except:
                        # Best-effort: log the failing link and continue.
                        print link.get('href')
                        traceback.print_exc()

    def updateDatabase(self):
        '''Rebuild/refresh the database from the library contents.'''
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''Return True if a book with this exact name exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, number):
        '''
        Scrape a book detail page (itemprop-annotated markup) into a Book.

        Also records, in book.name, the link text of the first filepi.com
        download anchor — used later by firefoxDownloadJob to click it.

        @param number: page path appended to baseUrl.
        @return: populated Book object.
        '''
        url = self.baseUrl + number
        content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(content, "lxml")
        book = Book()
        book.authors.append(Author(soup.find_all(itemprop="author")[0].text))
        book.isbn_13 = soup.find_all(itemprop="isbn")[0].text
        book.bookName = soup.find_all(itemprop="name")[0].text
        book.publisher = soup.find_all(itemprop="publisher")[0].text
        # Year is just 'YYYY'; fall back to now() on parse failure.
        try:
            date = datetime.strptime(
                str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
        except:
            date = datetime.now()
        book.publishedOn = date
        book.numberOfPages = soup.find_all(itemprop="numberOfPages")[0].text
        book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
        book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
        book.bookDescription = soup.find_all(itemprop="description")[0].text
        book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
        try:
            book.subTitle = soup.h3.text
        except:
            traceback.print_exc()
        # Positional table scraping — brittle, tied to the current layout.
        book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all(
            'td')[1].find_all('b')[0].text
        # book.fileSize=
        # .top > div:nth-child(2) > h3:nth-child(2)
        for link in soup.find_all('a'):
            if link.get('href').startswith('http://filepi.com'):
                book.name = link.text
                break
        return book

    def firefoxDownloadJob(self, book, refUrl):
        '''
        Download the book file by driving Firefox: open the page, click the
        filepi link (text stored in book.name), then poll until no '.part'
        (in-progress) files remain in the download directory.

        @param book: populated Book object.
        @param refUrl: book page path appended to baseUrl.
        '''
        # Creating directory
        directory_name = self.downloadDir()
        # Creating Actual URL
        url = self.baseUrl + refUrl
        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)
        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)
        imageUrl = self.baseUrl + book.bookImgName
        subUrl = book.bookImgName
        imageFileName = subUrl.split('/')[-1:][0]
        # Downloading book cover
        bookImagePath = os.path.join(directory_name, subUrl.split('/')[-1:][0])
        urllib.urlretrieve(imageUrl, bookImagePath)
        # Replace the site path with the bare file name before serializing.
        book.bookImgName = imageFileName
        # writing json file
        self.writeJsonToDir(directory_name, book)
        # Hard-coded local Firefox binary — environment specific.
        binary = FirefoxBinary('/docs/python_projects/firefox/firefox')
        fp = webdriver.FirefoxProfile()
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
        # folderList=2 => use the custom browser.download.dir below.
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "application/octet-stream")
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(url)
        efd_link = driver.find_element_by_link_text(book.name)
        # Read file size / format from the rendered page (absolute XPaths —
        # brittle, tied to the current layout).
        book.fileSize = driver.find_element_by_xpath(
            "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[8]/td[2]/b"
        ).text
        book.bookFormat = driver.find_element_by_xpath(
            "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[9]/td[2]/b"
        ).text
        efd_link.click()
        # Poll every 10s: Firefox keeps a '.part' file while downloading.
        flag = True
        while (flag):
            # # checking part file
            time.sleep(10)
            lst = []
            files = []
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lst.append(sName.split('.')[-1:][0])
                    files.append(os.path.join(directory_name, sName))
            print lst
            if 'part' not in lst:
                flag = False
                time.sleep(10)
                driver.close()
            else:
                # print files
                # if not self.isBookDownloading(files):
                #     driver.close()
                pass

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the Book object to <bookPath>/book.json.

        Converts datetime publishedOn to a string and author objects to
        dicts. Errors are logged, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            # NOTE(review): mutates book.__dict__ in place.
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        '''
        Heuristic check whether a download is still in progress: samples the
        sizes of '.part' files and reports False once sizes differ.
        NOTE(review): dic_files is filled once before the loop, so all
        samples see identical sizes — verify before relying on this method.
        '''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):
            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
                # print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
                checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: scrape the it-ebooks.info front page once.'''
        baseUrl = 'http://it-ebooks.info'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        itebook.findAllBookUrl()

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 if empty.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        Create (if needed) and chdir into the next per-book directory,
        named <libraryPath>/<maxBookId + 1>.

        @return: the directory path.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        os.chdir(directory_name)
        return directory_name
class FindingBook():
    '''
    Query facade over the Opal book database; the database lives in the
    workspace (Opal library) directory.
    '''

    def __init__(self, libraryPath=None):
        # libraryPath: workspace directory holding per-book folders + database.
        self.libraryPath = libraryPath
        self.createDatabase = CreateDatabase(libraryPath=libraryPath)
        pass

    def searchingBook(self, searchText=None, exactSearchFlag=False,
                      pageSize=10, offset=0):
        '''
        Return books whose name matches the search text.

        @param searchText: book-name query; empty/None returns all books.
        @param exactSearchFlag: True for exact-name match, False for
               similar-name (fuzzy) match.
        @param pageSize: page size for the fuzzy search.
        @param offset: NOTE(review): accepted but the fuzzy search is always
               called with offset=0 — confirm whether paging is intended.
        @return: (books, count) tuple.
        '''
        books = list()
        if searchText != None and searchText != '':
            os.chdir(self.libraryPath)
            if exactSearchFlag:
                books, count = self.createDatabase.findByBookName(searchText)
            else:
                books, count = self.createDatabase.findBySimlarBookName(
                    bookName=searchText, limit=pageSize, offset=0)
        else:
            books, count = self.findAllBooks()
        return books, count

    def countAllBooks(self):
        '''Return the total number of books in the database.'''
        bookCount = self.createDatabase.countAllBooks()
        return bookCount

    def findBookByNextMaxId(self, bookId=None):
        '''Return the book with the next-higher id (delegates to database).'''
        return self.createDatabase.findBookByNextMaxId(bookId)

    def findBookByPreviousMaxId(self, bookId=None):
        '''Return the book with the next-lower id (delegates to database).'''
        return self.createDatabase.findBookByPreviousMaxId(bookId)

    def findAllBooks(self, pageSize=None, offset=0):
        '''
        Return a page of all books in the library.

        @return: (books, count) tuple.
        '''
        books = list()
        os.chdir(self.libraryPath)
        books, count = self.createDatabase.findAllBook(pageSize=pageSize,
                                                       offset=offset)
        return books, count

    def findBookByIsbn(self, isbn_13):
        '''Return book(s) matching the given ISBN-13.'''
        bs = self.createDatabase.findBookByIsbn(isbn_13)
        return bs

    def getMaxBookId(self):
        # NOTE(review): only chdirs into the library; returns nothing —
        # looks unfinished.
        os.chdir(self.libraryPath)

    def deleteBook(self, book):
        '''
        Remove a book from the database and, on success, its files on disk.

        @param book: book object (book.bookPath is used for file removal).
        '''
        bookPath = book.bookPath
        isSuccessfulDatabaseDelete = self.createDatabase.removeBook(book)
        if isSuccessfulDatabaseDelete:
            BookTerminal().removeBook(bookPath=bookPath)

    def findFolderWithoutBook(self):
        '''
        Scan the library for per-book folders that look incomplete
        (fewer than 3 files: book + cover + json).
        NOTE(review): builds defaulterList but never returns it — confirm
        whether a return was intended.
        '''
        directory_name = self.libraryPath
        os.chdir(directory_name)
        # Book folders are numerically named -> sort as integers.
        listOfDir = [
            name for name in os.listdir(directory_name)
            if os.path.isdir(os.path.join(directory_name, name))
        ]
        if listOfDir:
            listOfDir.sort(key=int)
            defaulterList = list()
            for dir in listOfDir:
                lst = list()
                levelOne = os.path.join(directory_name, dir)
                for sName in os.listdir(levelOne):
                    if os.path.isfile(os.path.join(levelOne, sName)):
                        lst.append(sName.split('.')[-1:][0])
                # if 'pdf' not in lst:
                #     defaulterList.append(levelOne)
                if len(lst) < 3:
                    defaulterList.append(levelOne)
class FullCircleMagazine():
    '''
    Downloader for Full Circle magazine issues (fullcirclemagazine.org):
    fetches each issue's PDF, cover image and metadata into a numbered
    library directory and registers it in the database.
    NOTE(review): Python 2 code.
    '''

    def __init__(self, baseUrl=None):
        self.baseUrl = baseUrl
        # Library root where per-issue directories are created.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Browser-like User-Agent so the site serves normal pages.
        self.header_info = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }
        # book image url — set by startDownload (data URI), read by
        # downloadFullCircleMagazine.
        self.imageUrl = None
        self.bookUrl = None
        pass

    def downloadFullCircleMagazine(self, url, book=None, bookUrl=None):
        '''
        Download one issue PDF plus metadata/cover into a new directory and
        register it in the database.

        @param url: direct PDF URL, e.g.
               http://dl.fullcirclemagazine.org/issue1_en.pdf
        @param book: pre-built Book object (see createBookDetail).
        @param bookUrl: unused here.
        @return: (HTTP status code, created directory path).
        '''
        # url = 'http://dl.fullcirclemagazine.org/issue1_en.pdf'
        # 'http://dl.fullcirclemagazine.org/issue3_en.pdf'
        directory_name = self.createDownloadDir()
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        os.chdir(directory_name)
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            print r.status_code, url
            print '------->', int(r.headers["content-length"]) / 1000000
            # Size in MB from the Content-Length header (integer division
            # under Python 2).
            book.fileSize = str(
                round(int(r.headers["content-length"]) / 1000000, 2)) + ' MB'
            self.writeJsonToDir(directory_name, book)
            self.downloadBookImage(bookImagePath, self.imageUrl)
            # r = requests.get(bookUrl, headers=self.header_info, timeout=30)
            print '--------------->', r.url
            bookPath = os.path.join(directory_name, url.split('/')[-1])
            print bookPath
            with open(bookPath, 'wb') as bookFile:
                bookFile.write(r.content)
            self.updateDatabase(directory_name)
        return r.status_code, directory_name

    def createBookDetail(self, bookName=None):
        '''
        Build a Book object pre-filled with Full Circle magazine defaults.

        @param bookName: issue label, e.g. 'Issue 1'.
        '''
        book = Book()
        book.bookName = "Full Circle " + bookName
        book.bookFormat = 'pdf'
        book.tag = 'Technology'
        book.inLanguage = 'English'
        book.subTitle = 'Magazine'
        book.publisher = "Full Circle"
        book.bookImgName = bookName + '.jpg'
        book.hasCover = 'Yes'
        book.hasCode = 'No'
        return book

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        Serialize the Book object to <bookPath>/book.json.

        Converts datetime publishedOn to a string, author objects to dicts,
        and normalizes a blank isbn_13 to None. Errors are logged, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            # NOTE(review): mutates book.__dict__ in place.
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            if not row2dict['isbn_13'] == None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        Decode a data-URI cover image and write it to bookImagePath.

        Expects imageUrl of the form 'data:<mime>;base64,<payload>' (the
        site's issue covers are embedded as data URIs — see getImageUrl).
        '''
        print imageUrl
        head, data = imageUrl.split(',', 1)
        bits = head.split(';')
        mime_type = bits[0] if bits[0] else 'text/plain'
        charset, b64 = 'ASCII', False
        for bit in bits:
            if bit.startswith('charset='):
                charset = bit[8:]
            elif bit == 'base64':
                b64 = True
        # Do something smart with charset and b64 instead of assuming
        plaindata = data.decode("base64")
        # Do something smart with mime_type
        with open(bookImagePath, 'wb') as f:
            f.write(plaindata)
        print 'write image complete'
        # from PIL import Image
        # from StringIO import StringIO
        # r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        # print '--------------->', r.url
        # with open(bookImagePath, 'wb') as imageFile:
        #     imageFile.write(r.content)

    def updateDatabase(self, directory_name):
        '''Register the single downloaded issue directory in the database.'''
        # self.createDatabase.creatingDatabase()
        # self.createDatabase.addingData()
        self.createDatabase.addSingleBookData(directory_name)

    def isIsbnAvailableInDatabase(self, isbn_13=None):
        '''Return True if a book with this ISBN-13 exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''Return True if a book with this exact name exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def createDownloadDir(self):
        '''
        Create (if needed) and chdir into the next per-issue directory,
        named <libraryPath>/<maxBookId + 1>.

        @return: the directory path.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            # NOTE(review): 755 here is a decimal literal, not octal 0755 —
            # confirm the intended permission bits.
            os.makedirs(directory_name, 755)
        os.chdir(directory_name)
        return directory_name

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 if empty.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def getImageUrl(self, completeUrl, issueCount):
        '''
        Scrape the issue page for its cover image src (a data URI).

        @param completeUrl: issue page URL.
        @param issueCount: issue number as a string (used to verify the alt
               text matches the expected cover).
        @return: image src, or None if not found / page unreachable.
        '''
        print completeUrl
        imageUrl = None
        r = requests.get(completeUrl, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            # print soup
            alt = soup.find(class_='issuetable').find('img')['alt']
            if alt == 'Cover for Issue ' + issueCount + ' in English':
                imageUrl = soup.find(class_='issuetable').find('img')['src']
        print imageUrl
        return imageUrl

    def startDownload(self):
        '''
        Entry point: walk issue numbers from 1 upward, downloading each
        missing issue; stops at the first non-200 PDF response.
        '''
        logic = True
        i = 1
        while logic:
            pdfUrl = 'http://dl.fullcirclemagazine.org/issue' + str(
                i) + '_en.pdf'
            completeUrl = 'http://fullcirclemagazine.org/issue-' + str(i) + '/'
            if not self.isIssuePresent(str(i)):
                self.imageUrl = self.getImageUrl(completeUrl, str(i))
                book = self.createBookDetail('Issue ' + str(i))
                status_code, directory_name = self.downloadFullCircleMagazine(
                    book=book, url=pdfUrl)
                print completeUrl, status_code
                if status_code != 200:
                    logic = False
            i = i + 1

    def isIssuePresent(self, issue=None):
        '''Return True if this issue number is already in the database.'''
        isBookPresent = False
        bookName = "Full Circle Issue " + issue
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def getIssueDetail(self):
        '''Dump the issue-index tables from the Ubuntu wiki (debug helper).'''
        url = 'https://wiki.ubuntu.com/UbuntuMagazine/FullIssueIndex'
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            tables = soup.findAll('table')
            for table in tables:
                print table