class DownloadItEbook(threading.Thread):
    '''Thread that scrapes book metadata from it-ebooks.info and downloads
    the book file (via a Selenium-driven Firefox) plus its cover image into
    a numbered directory under the workspace library path.

    NOTE(review): written for Python 2 (print statements, urllib/urllib2,
    bare excepts). The thread's run() only logs its args; the real work is
    driven through startDownload().
    '''

    def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, verbose=None):
        ''' Constructor, setting location of downloaded book. '''
        super(DownloadItEbook, self).__init__(group=group, target=target, name=name, verbose=verbose)
        self.args = args
        self.kwargs = kwargs
        # Root library directory; each book gets a numbered subdirectory.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        pass

    def run(self):
        # NOTE(review): placeholder thread body — prints a tuple, does no download work.
        print('running with %s and %s', self.args, self.kwargs)
        return

    def getUrl(self, baseUrl, number):
        '''Build the detail-page URL for a book number, e.g. <base>/book/<number>.'''
        return baseUrl + '/book/' + str(number)

    def findBookDetail(self, baseUrl, number):
        '''Fetch a book detail page and scrape it into a Book object.

        Reads schema.org itemprop attributes (author, isbn, name, publisher,
        datePublished, numberOfPages, inLanguage, bookFormat, description,
        image) plus a positional table cell for the file size, and the first
        filepi.com link text as the download link label (book.name).
        '''
        url = self.getUrl(baseUrl, number)
        content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(content)
        book = Book()
        book.authors.append(Author(soup.find_all(itemprop="author")[0].text))
        book.isbn_13 = soup.find_all(itemprop="isbn")[0].text
        book.bookName = soup.find_all(itemprop="name")[0].text
        book.publisher = soup.find_all(itemprop="publisher")[0].text
        try:
            # Page exposes only a year; fall back to "now" on parse failure.
            date = datetime.strptime(
                str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
        except:
            date = datetime.now()
        book.publishedOn = date
        book.numberOfPages = soup.find_all(itemprop="numberOfPages")[0].text
        book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
        book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
        book.bookDescription = soup.find_all(itemprop="description")[0].text
        book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
        try:
            book.subTitle = soup.h3.text
        except:
            traceback.print_exc()
        # Positional scrape of the file-size cell — brittle against layout changes.
        book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all(
            'td')[1].find_all('b')[0].text
        # book.fileSize=
        # .top > div:nth-child(2) > h3:nth-child(2)
        # The visible text of the filepi.com link is used later to click the
        # download anchor by link text.
        for link in soup.find_all('a'):
            if link.get('href').startswith('http://filepi.com'):
                book.name = link.text
                break
        return book

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 when empty/None.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function will create directory to download book.
        Uses database maxId+1 as the new directory name; creates it if absent.
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        return directory_name

    def firefoxDownloadJob(self, book, baseUrl, number):
        '''Download the cover image and book file for `book`.

        Writes book.json metadata, then drives Firefox (profile configured
        for silent downloads into the book directory) to click the download
        link, and polls every 10s until no ".part" file remains.
        '''
        directory_name = self.downloadDir()
        # Creating Actual URL
        url = self.getUrl(baseUrl, number)
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)
        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)
        imageUrl = url + book.bookImgName
        subUrl = book.bookImgName
        imageFileName = subUrl.split('/')[-1:][0]
        logging.info(imageUrl)
        # Downloading book cover
        bookImagePath = os.path.join(directory_name,
                                     subUrl.split('/')[-1:][0])
        urllib.urlretrieve(baseUrl + book.bookImgName, bookImagePath)
        book.bookImgName = imageFileName
        # Serialize the Book object to book.json (authors flattened to dicts,
        # datetime stringified). Duplicates writeJsonToDir() inline.
        f = open(os.path.join(directory_name, 'book.json'), 'w')
        row2dict = book.__dict__
        authors = []
        if type(row2dict['publishedOn']) == datetime:
            row2dict['publishedOn'] = str(row2dict['publishedOn'])
        for a in row2dict['authors']:
            author = {}
            if type(a) == str:
                author['authorName'] = a
            else:
                author = a.__dict__
            authors.append(author)
        row2dict['authors'] = authors
        f.write(json.dumps(row2dict, sort_keys=False, indent=4))
        f.close()
        # Firefox profile: auto-save octet-stream downloads into our directory
        # without any dialogs (folderList=2 -> custom dir).
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "application/octet-stream")
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(url)
        # book.name was set to the filepi.com link text in findBookDetail().
        efd_link = driver.find_element_by_link_text(book.name)
        book.fileSize = driver.find_element_by_xpath(
            "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[8]/td[2]/b"
        ).text
        book.bookFormat = driver.find_element_by_xpath(
            "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[9]/td[2]/b"
        ).text
        efd_link.click()
        flag = True
        # Poll until Firefox's in-progress ".part" file disappears.
        while (flag):
            # # checking part file
            time.sleep(10)
            lst = []
            files = []
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    logging.info(sName.split('.')[-1:][0])
                    lst.append(sName.split('.')[-1:][0])
                    files.append(os.path.join(directory_name, sName))
            print lst
            if 'part' not in lst:
                logging.info("flag :" + str(flag))
                flag = False
                time.sleep(10)
                driver.close()
            else:
                # print files
                # if not self.isBookDownloading(files):
                #     driver.close()
                pass

    def writeJsonToDir(self, bookPath=None, book=None):
        ''' this function will write json file to given dir.

        Serializes book.__dict__ to <bookPath>/book.json; authors are
        flattened to plain dicts and datetime publishedOn is stringified.
        Errors are printed, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method will inform that book is getting downloading or not.

        Samples the sizes of ".part" files and reports False once the set of
        observed sizes stops being a single value.
        NOTE(review): all snapshots share one dict object (dic_files is never
        re-read), so the three compared snapshots are identical — verify
        intended behavior before relying on the result.
        '''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):
            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
                # print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
                checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: iterate book numbers, skip books already in the
        database (by ISBN), and download the rest.'''
        baseUrl = 'http://it-ebooks.info'
        miss = Missing()
        # lst = miss.missingNumbers()
        lst = [1464348534, 7102]
        for number in lst:
            print number
            # for number in range(6998, 0, -1):
            itebook = DownloadItEbook()
            url = itebook.getUrl(baseUrl, number)
            a = urllib2.urlopen(url)
            strig = a.geturl()
            # Site redirects missing books to a 404 URL; detect via suffix.
            if '404' != strig[-4:-1]:
                book = itebook.findBookDetail(baseUrl, number)
                # Is this book already availble (downloaded)
                # check book whethere it is existing in database.
                bs = FindingBook().findBookByIsbn(isbn_13=book.isbn_13)
                if bs:
                    print 'this books is already present.', book.isbn_13, book.bookName
                else:
                    try:
                        self.firefoxDownloadJob(book, baseUrl, number)
                        self.updateDatabase()
                    except:
                        print number, baseUrl
                        traceback.print_exc()
                    # try:
                    #     thread.start_new_thread( self.updateDatabase, ())
                    # except:
                    #     traceback.print_exc()
                    # logging.info("checking Is this book already availble (downloaded)" + book.bookName)

    def updateDatabase(self):
        '''Recreate the database schema and (re)load all book data.'''
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def updateBooksMetadata(self):
        '''Re-scrape metadata for already-downloaded book numbers and rewrite
        each book.json in place.'''
        miss = Missing()
        listOfDir = miss.availableNumbers()
        # NOTE(review): hard-coded resume offset from a previous run.
        listOfDir = listOfDir[1391:]
        baseUrl = 'http://it-ebooks.info'
        for number in listOfDir:
            print '------------------->', number
            # url = self.getUrl(baseUrl, number)
            # a = urllib2.urlopen(url)
            # strig = a.geturl()
            # if '404' != strig[-4:-1]:
            # number=7102
            # genUrl=self.downloadItEbook.getUrl(baseUrl, number)
            try:
                book = self.findBookDetail(baseUrl, number)
                book.itEbookUrlNumber = number
                subUrl = book.bookImgName
                imageFileName = subUrl.split('/')[-1:][0]
                book.bookImgName = imageFileName
                bookPath = os.path.join(Workspace().libraryPath, number)
                self.writeJsonToDir(bookPath, book)
            except:
                traceback.print_exc()
class ItEbook(object):
    '''Scraper/downloader for www.ebook777.com listing pages.

    Walks paginated listing pages, scrapes each book's detail table into a
    Book object, downloads the cover and the book archive via HTTP
    (requests), extracts .rar archives, and registers each book directory
    in the database.

    NOTE(review): a second class with the same name appears later in this
    file and shadows this one at import time.
    '''

    def __init__(self, baseUrl=None):
        ''' Constructor '''
        self.baseUrl = baseUrl
        # Root library directory; each book gets a numbered subdirectory.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Spoofed browser User-Agent for all HTTP requests.
        self.header_info = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }
        # book image url
        self.imageUrl = None
        self.bookUrl = None
        pass

    def getUrl(self, baseUrl):
        '''Return the configured base URL (the argument is ignored).'''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        This method retrive all the book url avaialbe in the page,
        e.g. <base>/page/2/. For each anchor with class "title" whose text is
        not a known category label, checks the database (by name, then by
        ISBN after scraping details) and downloads books not yet present.
        '''
        url = self.baseUrl + '/' + subUrl
        # print url
        # content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            # Anchor texts that are navigation/category links, not book titles.
            skipList = [
                'HOME', 'Category', 'Animals', 'Architecture', 'Art',
                'Astronomy', 'Biography', 'Biology', 'Business', 'Chemistry',
                'Cinema', 'Cookbooks', 'Cryptography', 'Culture', 'Design',
                'Drawing', 'Economics', 'Encyclopedia and Dictionary',
                'Engineering and Technology', 'Family and Friendship',
                'Fitness', 'Gambling', 'Games', 'Hardware', 'Healthcare',
                'History', 'Hobbies', 'Information Technologies', 'IT ebooks',
                'Languages', 'Martial Arts', 'Mathematics', 'Medicine',
                'Military', 'Music', 'Novels', 'Other', 'Personality',
                'Philosophy', 'Photo', 'Physics', 'Poetry',
                'Politics and Sociology', 'Programming', 'Psychology',
                'Relationships', 'Religion', 'Science', 'Security',
                'Sexuality', 'Software', 'Sport', 'Travel', 'Web Development'
            ]
            # with open(os.path.dirname(__file__) + os.sep + 'skipList.txt', 'r') as f:
            #     for line in f:
            #         skipList.append(line.rstrip('\n'))
            #     f.close
            listOfBookName = list()
            for link in soup.find_all('a', 'title'):
                if link.text.strip() != '' and link.text not in skipList:
                    listOfBookName.append(link.text)
                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    # self.isIsbnAvailableInDatabase()
                    # print isBookAvailable, link.text
                    if not isBookAvailable:
                        # print link.text, '\t', link.get('href'), isBookAvailable
                        book, bookUrl = self.findBookDetail(link.get('href'))
                        isBookAvailable = self.isIsbnAvailableInDatabase(
                            book.isbn_13)
                        # print book
                        if not isBookAvailable:
                            try:
                                print 'uploading database'
                                directory_name = self.downloadEbook(
                                    book, link.get('href'), bookUrl)
                                self.updateDatabase(directory_name)
                            except:
                                print link.get('href')
                                traceback.print_exc()

    def updateDatabase(self, directory_name):
        '''Register a single downloaded book directory in the database.'''
        # self.createDatabase.creatingDatabase()
        # self.createDatabase.addingData()
        self.createDatabase.addSingleBookData(directory_name)

    def isIsbnAvailableInDatabase(self, isbn_13=None):
        '''True when a book with this ISBN-13 already exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''True when a book with this exact name already exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        ''' Scrape a detail page (e.g. http://www.ebook777.com/shut-youre-welcome/)
        into a (Book, download-url) tuple. Also stores the cover image URL on
        self.imageUrl as a side effect.

        NOTE(review): `bookUrl` is only assigned inside the 200-status branch;
        a non-200 response raises UnboundLocalError at the return — confirm
        callers rely on the surrounding try/except.
        '''
        book = None
        # url=self.baseUrl+'/'+subUrl
        url = subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            book = Book()
            book.bookDescription = soup.find(id="main-content-inner").p.text
            book.bookName = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='title').text
            book.subTitle = soup.find(id="main-content-inner").find(
                class_='article-details').find(class_='subtitle').text
            bookUrl = soup.find(id="main-content-inner").find(
                class_='download-links').find('a')['href']
            # The attributes table: 3-column rows carry cover + author,
            # 2-column rows are labelled key/value attributes.
            table_body = soup.find('table')
            rows = table_body.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) == 3:
                    book.bookImgName = cols[0].img.attrs['alt']
                    self.imageUrl = cols[0].img.attrs['src']
                    if cols[1].text == 'Author':
                        # print cols[2].text
                        author = Author()
                        author.authorName = cols[2].text
                        book.authors.append(author)
                        # book.authors.append()
                if len(cols) == 2:
                    if cols[0].text == 'File size':
                        book.fileSize = cols[1].text
                    if cols[0].text == 'Year':
                        try:
                            date = datetime.strptime(cols[1].text, '%Y')
                        except:
                            date = datetime.now()
                        book.publishedOn = date
                    if cols[0].text == 'Pages':
                        book.numberOfPages = cols[1].text
                    if cols[0].text == 'Language':
                        book.inLanguage = cols[1].text
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Category':
                        book.tag = cols[1].text
                    # NOTE(review): duplicate 'File format' check — harmless
                    # but redundant.
                    if cols[0].text == 'File format':
                        book.bookFormat = cols[1].text
                    if cols[0].text == 'Isbn':
                        book.isbn_13 = cols[1].text
            # print cols
        return book, bookUrl

    def downloadEbook(self, book, refUrl, bookUrl):
        '''Download cover + book file for `book` into a fresh numbered
        directory, write book.json, and extract any .rar archive.
        Returns the directory path.'''
        directory_name = self.downloadDir()
        url = refUrl
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        self.downloadBookImage(bookImagePath, self.imageUrl)
        self.writeJsonToDir(directory_name, book)
        r = requests.get(bookUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        # Name the saved file after the last URL path segment.
        bookPath = os.path.join(directory_name, bookUrl.split('/')[-1])
        with open(bookPath, 'wb') as bookFile:
            bookFile.write(r.content)
        try:
            self.extractRar(directory_name)
        except:
            traceback.print_exc()
            pass
        return directory_name

    def firefoxDownloadJob(self, book, refUrl):
        '''Alternative download path using a Selenium-driven Firefox: click
        the first download link and poll every 10s until no ".part" file
        remains, then extract any .rar archive.'''
        # Creating directory
        directory_name = self.downloadDir()
        # Creating Actual URL
        # url = self.baseUrl+refUrl
        url = refUrl
        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)
        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)
        # Downloading book cover
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        self.downloadBookImage(bookImagePath, self.imageUrl)
        # writing json file
        self.writeJsonToDir(directory_name, book)
        binary = FirefoxBinary('/docs/python_projects/firefox/firefox')
        # Firefox profile: silent downloads into our directory for a broad
        # set of MIME types (folderList=2 -> custom dir).
        fp = webdriver.FirefoxProfile()
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
        # fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
        fp.set_preference(
            "browser.helperApps.neverAsk.saveToDisk",
            "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed"
        )
        fp.set_preference("browser.helperApps.alwaysAsk.force", False)
        fp.set_preference("browser.popups.showPopupBlocker", False)
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(url)
        efd_link = driver.find_element_by_css_selector(
            ".download-links > a:nth-child(1)")
        efd_link.click()
        # efd_link.send_keys(Keys.RETURN)
        flag = True
        # Poll until Firefox's in-progress ".part" file disappears.
        while (flag):
            # # checking part file
            time.sleep(10)
            lst = []
            files = []
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lst.append(sName.split('.')[-1:][0])
                    files.append(os.path.join(directory_name, sName))
            # print lst
            if 'part' not in lst:
                flag = False
                time.sleep(10)
                driver.close()
            else:
                # print files
                # if not self.isBookDownloading(files):
                #     driver.close()
                pass
        self.extractRar(directory_name)

    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        ''' this method will download image from imageUrl location and keep
        it at bookImagePath '''
        # NOTE(review): PIL/StringIO imports are unused here — presumably
        # left over from an earlier in-memory image approach.
        from PIL import Image
        from StringIO import StringIO
        r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        with open(bookImagePath, 'wb') as imageFile:
            imageFile.write(r.content)

    def writeJsonToDir(self, bookPath=None, book=None):
        ''' this function will write json file to given dir.

        Serializes book.__dict__ to <bookPath>/book.json; authors are
        flattened to dicts, datetime publishedOn is stringified, and a
        blank isbn_13 is normalized to None. Errors are printed, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            if not row2dict['isbn_13'] == None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method will inform that book is getting downloading or not.

        NOTE(review): all snapshots share one dict object, so the three
        compared snapshots are identical — verify intended behavior.
        '''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):
            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
                # print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
                checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: crawl listing pages from a hard-coded start page,
        forever (the loop condition never becomes False).'''
        # baseUrl = 'http://itebooks.website'
        # baseUrl = 'http://it-ebooks.directory'
        baseUrl = 'http://www.ebook777.com'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 1100
        while logicTrue:
            subUrl = 'page/' + str(i) + '/'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)
            # if i==4:
            #     break

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 when empty/None.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function will create directory to download book.
        Uses database maxId+1 as the new directory name, creates it if
        absent, and chdirs into it (process-wide side effect).
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        os.chdir(directory_name)
        return directory_name

    def extractRar(self, directory_name):
        ''' extracting rar file

        Extracts every member of each .rar in the directory except
        .html/.htm/.txt files. Chdirs into the directory first
        (process-wide side effect).
        '''
        os.chdir(directory_name)
        # directory_name = '/docs/new/library/8006'
        listOfFiles = [
            name for name in os.listdir(directory_name)
            if not os.path.isdir(os.path.join(directory_name, name))
        ]
        for fileName in listOfFiles:
            if fileName.endswith(".rar"):
                # print fileName directory_name
                rar = rarfile.RarFile(os.path.join(directory_name, fileName))
                # print rar.namelist()
                infoList = rar.infolist()
                nameList = rar.namelist()
                for name in nameList:
                    if not ((name.endswith('.html')) or
                            (name.endswith('.htm')) or
                            (name.endswith('.txt'))):
                        rar.extract(name, directory_name)
        pass
class ItEbook(object):
    '''Scraper/downloader for itebooks.website listing pages.

    NOTE(review): this redefinition shadows the earlier ItEbook class
    (ebook777 variant) defined above in the same module.
    '''

    def __init__(self, baseUrl=None):
        ''' Constructor '''
        self.baseUrl = baseUrl
        # Root library directory; each book gets a numbered subdirectory.
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        # Spoofed browser User-Agent for all HTTP requests.
        self.header_info = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }
        pass

    def getUrl(self, baseUrl):
        '''Return the configured base URL (the argument is ignored).'''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        This method retrive all the book url avaialbe in the page,
        e.g. http://itebooks.website/page-2.html. For each non-navigation
        anchor, checks the database by book name and downloads books not
        yet present.
        '''
        url = self.baseUrl + '/' + subUrl
        print url
        # content = urllib2.urlopen(url).read()
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            # Anchor texts that are navigation links, not book titles.
            skipList = (u'\nCategories', u'\nContact', u'\nUpload',
                        u'\nDonate', u'IT eBooks', u'Prev', u'Next')
            listOfBookName = list()
            for link in soup.find_all('a'):
                if link.text.strip() != '' and link.text not in skipList:
                    listOfBookName.append(link.text)
                    isBookAvailable = self.isBookNameAvailableInDatabase(
                        link.text)
                    if not isBookAvailable:
                        print link.text, '\t', link.get(
                            'href'), isBookAvailable
                        book = self.findBookDetail(link.get('href'))
                        # print book
                        try:
                            print 'uploading database'
                            self.firefoxDownloadJob(book, link.get('href'))
                            self.updateDatabase()
                        except:
                            print link.get('href')
                            traceback.print_exc()

    def updateDatabase(self):
        '''Recreate the database schema and (re)load all book data.'''
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        '''True when a book with this exact name already exists in the database.'''
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        ''' Scrape a book detail page into a Book object.

        Reads schema.org itemprop attributes plus span12 divs for
        description/subtitle and a bordered table for file size.
        Returns None when the page does not respond with HTTP 200.
        '''
        book = None
        url = self.baseUrl + '/' + subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            book = Book()
            book.authors.append(
                Author(soup.find_all(itemprop="author")[0].text))
            # This site exposes two ISBN itemprops: ISBN-10 then ISBN-13.
            book.isbn_10 = soup.find_all(itemprop="isbn")[0].text
            book.isbn_13 = soup.find_all(itemprop="isbn")[1].text
            book.bookName = soup.find_all(itemprop="name")[0].text
            book.publisher = soup.find_all(itemprop="publisher")[0].text
            try:
                # Page exposes only a year; fall back to "now" on parse failure.
                date = datetime.strptime(
                    str(soup.find_all(itemprop="datePublished")[0].text),
                    '%Y')
            except:
                date = datetime.now()
            book.publishedOn = date
            book.numberOfPages = soup.find_all(
                itemprop="numberOfPages")[0].text
            book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
            book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
            book.bookDescription = soup.find_all("div",
                                                 {"class": "span12"})[3].text
            print soup.find_all(itemprop="image")
            book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
            try:
                book.subTitle = soup.find_all("div",
                                              {"class": "span12"})[1].text
            except:
                traceback.print_exc()
            # book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all('td')[1].find_all('b')[0].text
            # Positional scrape of the file-size cell — brittle against layout changes.
            book.fileSize = soup.find_all(
                'table', {"class": "table table-bordered"
                          })[1].find_all('tr')[5].find_all('td')[1].text
            # book.fileSize=
            # .top > div:nth-child(2) > h3:nth-child(2)
            # for link in soup.find_all('a'):
            #     if link.get('href').startswith('http://filepi.com'):
            #         book.name = link.text
            #         break
        return book

    def firefoxDownloadJob(self, book, refUrl):
        '''Download cover + book file for `book`, driving a browser to click
        the element with id "download" and polling every 10s until no
        ".part" file remains.

        NOTE(review): a FirefoxProfile is configured but the driver launched
        is webdriver.Chrome(), so the profile (download dir, silent save)
        is never applied — confirm which browser is intended.
        '''
        # Creating directory
        directory_name = self.downloadDir()
        # Creating Actual URL
        url = self.baseUrl + refUrl
        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)
        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)
        imageUrl = self.baseUrl + book.bookImgName
        subUrl = book.bookImgName
        imageFileName = subUrl.split('/')[-1:][0]
        # Downloading book cover
        bookImagePath = os.path.join(directory_name,
                                     subUrl.split('/')[-1:][0])
        # urllib.urlretrieve(imageUrl,bookImagePath)
        # NOTE(review): PIL/StringIO imports are unused here.
        from PIL import Image
        from StringIO import StringIO
        r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        print '--------------->', r.url
        with open(bookImagePath, 'wb') as imageFile:
            imageFile.write(r.content)
        book.bookImgName = imageFileName
        # writing json file
        self.writeJsonToDir(directory_name, book)
        fp = webdriver.FirefoxProfile()
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', True)
        fp.set_preference('browser.download.manager.focusWhenStarting', True)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", True)
        fp.set_preference("browser.download.manager.useWindow", True)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "application/octet-stream")
        fp.update_preferences()
        driver = webdriver.Chrome()
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(url)
        efd_link = driver.find_element_by_id(id_='download')
        # efd_link.click()
        efd_link.send_keys(Keys.RETURN)
        flag = True
        # Poll until the browser's in-progress ".part" file disappears.
        while (flag):
            # # checking part file
            time.sleep(10)
            lst = []
            files = []
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lst.append(sName.split('.')[-1:][0])
                    files.append(os.path.join(directory_name, sName))
            print lst
            if 'part' not in lst:
                flag = False
                time.sleep(10)
                driver.close()
            else:
                # print files
                # if not self.isBookDownloading(files):
                #     driver.close()
                pass

    def writeJsonToDir(self, bookPath=None, book=None):
        ''' this function will write json file to given dir.

        Serializes book.__dict__ to <bookPath>/book.json; authors are
        flattened to dicts and datetime publishedOn is stringified.
        Errors are printed, not raised.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method will inform that book is getting downloading or not.

        NOTE(review): all snapshots share one dict object, so the three
        compared snapshots are identical — verify intended behavior.
        '''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):
            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
                # print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
                checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: crawl listing pages page-2.html, page-3.html, ...
        forever (the loop condition never becomes False).'''
        baseUrl = 'http://itebooks.website'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 2
        while logicTrue:
            subUrl = 'page-' + str(i) + '.html'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)
            # if i==4:
            #     break

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 when empty/None.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function will create directory to download book.
        Uses database maxId+1 as the new directory name, creates it if
        absent, and chdirs into it (process-wide side effect).
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        os.chdir(directory_name)
        return directory_name
class AddBook():
    '''
    This class have been written to add book to Opal workspace library.

    Copies a selected book file into a numbered library directory, extracts
    PDF/EPUB metadata, generates a cover image, writes book.json, and
    records the book in the database.
    '''

    def __init__(self, libraryPath=None):
        # Fresh Book record with a generated UUID; populated during
        # addingBookToWorkspace().
        self.book = Book()
        self.book.uuid = str(uuid.uuid4())
        self.book.tag = None
        self.book.authors = list()
        self.libraryPath = libraryPath
        self.createDatabase = CreateDatabase(libraryPath=libraryPath)

    def getMaxBookID(self):
        '''Return the highest book id in the database, or 0 when None.'''
        maxBookId = self.createDatabase.getMaxBookID()
        if maxBookId == None:
            maxBookId = 0
        return maxBookId

    def addingBookToWorkspace(self, sourcePath=None, maxBookId=None):
        '''
        This function will be creating a new dir.
        Get the max of id in Book table. Create the folder name with max of
        id plus one.
        @param sourcePath: This is the path of selected book.
        -1. Check if database present in workspace. There is possibility of
            a new workspace.
        0. Check if book already present in workspace.
        1. Create a folder with max_book_id+1 .
        2. Copy the book file in the directory.
        3. Create metadata i.e. (book.json)
        4. Make an entry in database.
        '''
        if sourcePath:
            # if maxBookId:
            #     maxBookId = self.createDatabase.getMaxBookID()
            #
            # if maxBookId == None:
            #     maxBookId = 0
            # workspacePath = Workspace().libraryPath
            self.book.bookPath = os.path.join(self.libraryPath,
                                              str(maxBookId + 1))
            head, tail = os.path.split(sourcePath)
            self.book.bookFileName = tail
            self.book.inLanguage = 'English'
            self.book.hasCover = 'Y'
            # Derive format from the extension and name from the remainder.
            splited_name = tail.split(".")
            self.book.bookFormat = splited_name[-1:][0]
            splited_name.remove(self.book.bookFormat)
            book_file_name = '.'.join(splited_name)
            self.book.bookName = book_file_name
            self.book.wishListed = 'No'
            # Skip everything when a book with the same file name exists.
            if not self.findingSameBook():
                self.book.bookPath = os.path.join(self.libraryPath,
                                                  str(maxBookId + 1))
                if not os.path.exists(self.book.bookPath):
                    os.makedirs(self.book.bookPath)
                dest = os.path.join(self.book.bookPath, tail)
                if sourcePath != dest:
                    shutil.copy(sourcePath, dest)
                if 'pdf' == self.book.bookFormat:
                    self.getPdfMetadata(sourcePath)
                if 'epub' == self.book.bookFormat:
                    self.getEpubMetadata(sourcePath)
                    pass
                os.chdir(self.book.bookPath)
                self.book.bookImgName = book_file_name + '.jpg'
                BookImage().getBookImage(self.book.bookPath, book_file_name,
                                         self.book.bookFormat)
                # Deep copy so JSON serialization (which mutates the dict)
                # cannot disturb the object saved to the database.
                book_copy1 = copy.deepcopy(self.book)
                self.writeBookJson(self.book.bookPath, book_copy1)
                self.addingBookInfoInDatabase(self.book)

    def getImageFileName(self):
        '''Locate the cover image file name when the expected one is missing,
        by scanning the current directory for "-<digits>.jpg" suffixed files.

        NOTE(review): `bookImgName` is only assigned inside the regex-match
        branch, so this raises UnboundLocalError when the expected image
        exists or no candidate matches; also reads self.currentBook, which
        is not set in __init__ — confirm callers establish it.
        '''
        imgFilePath = os.path.join(self.book.bookPath, self.book.bookImgName)
        if not os.path.exists(imgFilePath):
            directory = '.'
            pattern = re.compile(r"\-(\d*)\.jpg$")
            for file in os.listdir(directory):
                print(file)
                m = pattern.search(file)
                if m:
                    # print(m.groups())
                    imgFilePath = m.group()
                    bookImgName = self.currentBook.bookImgName.replace(
                        '.jpg', m.group())
                    imgFilePath = os.path.join(self.currentBook.bookPath,
                                               bookImgName)
        return bookImgName

    def findingSameBook(self):
        '''
        This method will allow you to find the same book available in
        workspace already.
        1. check for same book name.
        2. check for isbn.
        (Currently only the file-name check is implemented.)
        '''
        logger.debug('findingSameBook')
        isSameBookPresent = False
        books = self.createDatabase.findBookByFileName(self.book.bookFileName)
        logger.debug('len(books): %s', len(books))
        if len(books) > 0:
            isSameBookPresent = True
        return isSameBookPresent

    def addingBookInfoInDatabase(self, book):
        '''
        This method will add new book info in database.
        '''
        logger.debug('addingBookInfoInDatabase')
        self.createDatabase.saveBook(book)

    def writeBookJson(self, newDirPath=None, book=None):
        '''
        This function will write book.json (metadata) of the newly added
        book in workspace. SQLAlchemy internals (_sa_instance_state,
        book_assoc) are stripped before serialization; publishedOn and
        createdOn are stamped with the current time as strings.
        '''
        logger.debug('writeBookJson newDirPath: %s', newDirPath)
        f = open(os.path.join(newDirPath, 'book.json'), 'w')
        row2dict = dict(book.__dict__)
        authors = []
        try:
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                if '_sa_instance_state' in author:
                    del author['_sa_instance_state']
                if 'book_assoc' in author:
                    del author['book_assoc']
                authors.append(author)
            if '_sa_instance_state' in row2dict:
                del row2dict['_sa_instance_state']
            if 'authors' in row2dict:
                del row2dict['authors']
            if 'book_assoc' in row2dict:
                del row2dict['book_assoc']
            row2dict['authors'] = authors
            row2dict['publishedOn'] = str(datetime.now())
            row2dict['createdOn'] = str(datetime.now())
        except Exception as e:
            logger.error(e)
        # print newDirPath
        # print row2dict
        f.write(json.dumps(row2dict, sort_keys=True, indent=4))
        f.close()

    def getEpubMetadata(self, path=None):
        '''Populate self.book (authors, tag, cover image, createdOn) from an
        EPUB file in the book directory. Chdirs into the book directory
        (process-wide side effect).'''
        logger.debug('getEpubMetadata')
        os.chdir(self.book.bookPath)
        file_name = self.book.bookName + '.epub'
        epubBook = EpubBook()
        epubBook.open(file_name)
        epubBook.parse_contents()
        authorList = list()
        for authorName in epubBook.get_authors():
            author = Author()
            author.authorName = authorName
            author.aboutAuthor = 'aboutAuthor'
            authorList.append(author)
        self.book.authors = authorList
        self.book.tag = epubBook.subjectTag
        epubBook.extract_cover_image(outdir='.')
        self.book.createdOn = datetime.now()

    def getPdfMetadata(self, path=None):
        '''
        This method will get the pdf metadata and populate self.book
        (pages, tag, name, publisher, dates, file size, authors).
        Each metadata field is read inside its own try/except so one bad
        field cannot abort the rest.
        '''
        logger.debug('getPdfMetadata path: %s', path)
        if path:
            try:
                input = PdfFileReader(open(path, "rb"))
                logger.debug('getIsEncrypted : %s ', input.getIsEncrypted())
            except Exception as e:
                logger.error(e, exc_info=True)
            pdf_info = None
            try:
                pdf_toread = PdfFileReader(open(path, "rb"))
                if pdf_toread.isEncrypted:
                    # Many PDFs are "encrypted" with an empty owner password.
                    try:
                        pdf_toread.decrypt('')
                    except Exception as e:
                        logger.error(e, exc_info=True)
            except Exception as e:
                logger.error(e, exc_info=True)
            try:
                pdf_info = pdf_toread.getDocumentInfo()
                logger.debug('NumPages:%s', pdf_toread.getNumPages())
                self.book.numberOfPages = pdf_toread.getNumPages()
                # value = pdf_info.subject
                subject = None
                if pdf_info.subject and type(pdf_info.subject) == str:
                    # Ignore errors even if the string is not proper UTF-8 or has
                    # broken marker bytes.
                    # Python built-in function unicode() can do this.
                    subject = pdf_info.subject
                # else:
                #     # Assume the value object has proper __unicode__() method
                #     value = unicode(pdf_info.subject)
                #     print 'else'
                if not self.book.tag and subject:
                    self.book.tag = subject
                elif self.book.tag and subject:
                    self.book.tag = self.book.tag + '' + subject
            except Exception as e:
                logger.error(e, exc_info=True)
            try:
                if pdf_info.title != None and pdf_info.title.strip() != '':
                    self.book.bookName = str(pdf_info.title)
            except Exception as e:
                logger.error(e, exc_info=True)
            try:
                if pdf_info.creator:
                    self.book.publisher = str(pdf_info.creator.encode('utf-8'))
            except Exception as e:
                logger.error(e, exc_info=True)
            self.book.createdOn = datetime.now()
            try:
                # PDF dates look like "D:YYYYMMDDHHmmSS..."; chars 2..10 are
                # the YYYYMMDD part.
                # print str(pdf_info['/CreationDate'])[2:10]
                date = datetime.strptime(
                    str(pdf_info['/CreationDate'])[2:10], '%Y%m%d')
                self.book.publishedOn = date
            except Exception as e:
                logger.error(e, exc_info=True)
                logger.error('CreationDate not found')
            logger.debug(Util().convert_bytes(os.path.getsize(path)))
            self.book.fileSize = Util().convert_bytes(os.path.getsize(path))
            # if 'ISBN'.lower() in str(pdf_info['/Subject']).lower():
            #     self.book.isbn_13 = str(pdf_info['/Subject'])[6:]
            author = Author()
            val = 'Unknown'
            try:
                if pdf_info.author != None and pdf_info.author.strip() != '':
                    val = pdf_info.author
                    # val = val.encode("utf8", "ignore")
            except Exception as e:
                logger.error(e, exc_info=True)
            author.authorName = val
            authorList = list()
            authorList.append(author)
            self.book.authors = authorList
class PacktpubCrawl:
    '''
    Selenium-based crawler for packtpub.com: logs in, walks the
    account's "My ebooks" list and claims the daily free ebook when it
    is not already owned.

    NOTE(review): Python 2 code (print statements). Login credentials
    and the Firefox binary path are hard-coded in findBookUrl.
    '''

    def __init__(self):
        # Site root plus library folder and database facade shared by all methods.
        self.baseUrl = "https://www.packtpub.com/"
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()

    def findBookUrl(self):
        '''Drive Firefox through login, the "My ebooks" listing and the
        free-learning claim page. Side effects only; returns nothing.'''
        # Downloads land in the current directory ('.') via the profile below.
        directory_name = '.'
        # Machine-specific Firefox binary path -- TODO make configurable.
        binary = FirefoxBinary('/docs/python_projects/firefox/firefox')
        fp = webdriver.FirefoxProfile()
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console");
        # folderList=2: use the custom download dir configured below.
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
        # fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
        # MIME types that are saved to disk without a download dialog.
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed");
        fp.set_preference("browser.helperApps.alwaysAsk.force", False);
        fp.set_preference("browser.popups.showPopupBlocker", False);
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(self.baseUrl)
        # Open the login popup.
        efd_link = driver.find_element_by_css_selector(".login-popup > div:nth-child(1)")
        efd_link.click()
        try:
            emailEl = driver.find_element_by_css_selector('#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > input:nth-child(1)')
            # emailEl = driver.find_element_by_name("email")
            ''' Login with user credential '''
            emailEl.send_keys('*****@*****.**')
            passwordEl = driver.find_element_by_css_selector("#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > input:nth-child(1)")
            passwordEl.send_keys('default')
            loginEl = driver.find_element_by_css_selector("#packt-user-login-form > div:nth-child(1) > div:nth-child(1) > div:nth-child(3) > input:nth-child(1)")
            loginEl.click()
            if True:
                ''' clicking on My Account '''
                myAccountEl = driver.find_element_by_css_selector('#account-bar-logged-in > a:nth-child(1) > div:nth-child(1) > strong:nth-child(1)')
                myAccountEl.click()
                ''' clicking My ebooks '''
                # NOTE(review): driver.get() returns None, so myEbook is
                # always None; the commented-out myEbook.click() below
                # would have failed.
                myEbook = driver.get(self.baseUrl + 'account/my-ebooks')
                productListEls = driver.find_elements_by_css_selector('div.product-line')
                print len(productListEls)
                bookList = list()
                # Build one Book per product row; ISBN is read from the
                # row's 'isbn' attribute.
                for productEl in productListEls:
                    print productEl
                    try:
                        bookName = productEl.find_element_by_css_selector('.title').text
                        book = self.createBookDetail(bookName)
                        productEl.click()
                        readMeEl = productEl.find_element_by_css_selector('.fake-button-text')
                        print 'new page',
                        isbnEl = productEl.find_elements_by_css_selector('div > div:nth-child(2) > div:nth-child(1)> a:nth-child(1) > div:nth-child(1)')
                        book.isbn_13 = isbnEl[0].get_attribute('isbn')
                        # readMeEl.click()
                        print 'div.product-line:nth-child(1) > div:nth-child(2) > div:nth-child(1) > a:nth-child(1) > div:nth-child(1)',
                        # readMeEl.find_element_by_css_selector('h2.ng-binding')
                        # # readingEl = driver.get('https://www.packtpub.com/mapt/book/All%20Books/' + book.isbn_13)
                        # bookName1=driver.find_elements_by_css_selector('h2.ng-binding')[0].text
                        bookList.append(book)
                    except Exception as e:
                        # Best-effort: one broken product row must not stop the crawl.
                        print e
                # product_account_list_el=driver.find_elements_by_css_selector('#product-account-list')
                driver.get('https://www.packtpub.com/packt/offers/free-learning')
                try:
                    ''' clicking on Claim your free ebook '''
                    bookNameEl_1 = driver.find_element_by_css_selector('.dotd-title > h2:nth-child(1)')
                    isBookAlreadyAvailable = False
                    bookName_1 = bookNameEl_1.text
                    # Skip the claim when today's offer is already in the account.
                    for book in bookList:
                        if bookName_1 in book.bookName:
                            isBookAlreadyAvailable = True
                            break
                    if not isBookAlreadyAvailable:
                        claimFreeEbookEl = driver.find_element_by_css_selector('.book-claim-token-inner > input:nth-child(3)')
                        claimFreeEbookEl.click()
                except Exception as e:
                    print e
                # myEbook.click()
        except Exception as e:
            print e
        finally:
            print 'completed'
        print 'hi'

    def createBookDetail(self, bookName=None):
        '''Build a Book pre-filled with Packt-specific defaults; only the
        name (and the image name derived from it) varies per title.'''
        book = Book()
        book.bookName = bookName
        book.bookFormat = 'pdf'
        book.tag = 'Technology'
        book.inLanguage = 'English'
        book.subTitle = None
        book.publisher = "Packt Publishing Limited"
        # Cover image file is expected to be named after the book.
        book.bookImgName = bookName + '.jpg'
        book.hasCover = 'Yes'
        book.hasCode = None
        return book

    def getMaxBookID(self):
        '''
        This function will get max book id.
        @param number:it takes database maxId+1 to create new directory .
        '''
        maxBookId = self.createDatabase.getMaxBookID()
        # Empty database: start numbering from 0 so the first dir is "1".
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function will create directory to download book.
        @param number:it takes database maxId+1 to create new directory .
        '''
        # Next free numbered directory under the library path.
        directory_name = os.path.join(self.directory_name, str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        # Side effect: the process working directory is changed.
        os.chdir(directory_name)
        return directory_name
class ItEbook(object):
    '''
    This class downloads first page of itebookinfo
    '''

    def __init__(self, baseUrl=None):
        '''
        Constructor

        :param baseUrl: site root, e.g. 'http://it-ebooks.info'.
        '''
        self.baseUrl = baseUrl
        self.directory_name = Workspace().libraryPath
        self.createDatabase = CreateDatabase()
        pass

    def getUrl(self, baseUrl):
        '''this method will find and constuct all url of url given'''
        # NOTE(review): the baseUrl parameter is ignored; this always
        # returns the instance's own baseUrl.
        return self.baseUrl

    def findAllBookUrl(self):
        '''
        This method retrive all the book url avaialbe in the page.
        '''
        content = urllib2.urlopen(self.baseUrl).read()
        soup = BeautifulSoup(content, "lxml")
        # Anchor texts that are site chrome, not book titles.
        skipList = ('IT eBooks', 'IT eBooks Group', u'IT-eBooks.Info', u'IT-eBooks API', u'IT-eBooks Search', 'Tweet')
        listOfBookName = list()
        for link in soup.find_all('a'):
            if link.text.strip() != '' and link.text not in skipList:
                listOfBookName.append(link.text)
                # Only fetch/download books we have not stored before.
                isBookAvailable = self.isBookNameAvailableInDatabase(link.text)
                if not isBookAvailable:
                    print link.text, '\t', link.get('href'), isBookAvailable
                    book = self.findBookDetail(link.get('href'))
                    # print book
                    try:
                        self.firefoxDownloadJob(book, link.get('href'))
                        self.updateDatabase()
                    except:
                        # Best-effort: log the failing href and move on
                        # to the next link.
                        print link.get('href')
                        traceback.print_exc()

    def updateDatabase(self):
        # Create the schema (if needed) and ingest the downloaded data.
        self.createDatabase.creatingDatabase()
        self.createDatabase.addingData()

    def isBookNameAvailableInDatabase(self, bookName=None):
        # True when a book with this exact name already exists in the database.
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, number):
        ''' This method will download book cover.
        It will provide book object.

        :param number: path fragment appended to baseUrl to form the
            book's detail-page URL.
        :return: Book populated from the page's schema.org microdata.
        '''
        url = self.baseUrl + number
        content = urllib2.urlopen(url).read()
        soup = BeautifulSoup(content, "lxml")
        book = Book()
        # Scrape schema.org microdata (itemprop=...) into the Book fields.
        book.authors.append(Author(soup.find_all(itemprop="author")[0].text))
        book.isbn_13 = soup.find_all(itemprop="isbn")[0].text
        book.bookName = soup.find_all(itemprop="name")[0].text
        book.publisher = soup.find_all(itemprop="publisher")[0].text
        try:
            # The published date on the page is a bare year, e.g. '2015'.
            date = datetime.strptime(str(soup.find_all(itemprop="datePublished")[0].text), '%Y')
        except:
            # Fall back to 'now' when the year is missing or unparsable.
            date = datetime.now()
        book.publishedOn = date
        book.numberOfPages = soup.find_all(itemprop="numberOfPages")[0].text
        book.inLanguage = soup.find_all(itemprop="inLanguage")[0].text
        book.bookFormat = soup.find_all(itemprop="bookFormat")[0].text
        book.bookDescription = soup.find_all(itemprop="description")[0].text
        book.bookImgName = (soup.find_all(itemprop="image")[0]).get('src')
        try:
            book.subTitle = soup.h3.text
        except:
            traceback.print_exc()
        # File size lives in a fixed cell of the page's fourth table --
        # brittle, position-based scraping.
        book.fileSize = soup.find_all('table')[3].find_all('tr')[7].find_all('td')[1].find_all('b')[0].text
        # book.fileSize=
        # .top > div:nth-child(2) > h3:nth-child(2)
        # The filepi.com link's text is the downloadable file's name.
        for link in soup.find_all('a'):
            if link.get('href').startswith('http://filepi.com'):
                book.name = link.text
                break
        return book

    def firefoxDownloadJob(self, book, refUrl):
        '''The function of this method is to download link of given URL.

        Downloads the cover image, writes book.json, then drives Firefox
        to fetch the book file and polls until no .part file remains.
        '''
        # Creating directory
        directory_name = self.downloadDir()
        # Creating Actual URL
        url = self.baseUrl + refUrl
        lsFiles = []
        # Checking if there are three files in this URL.
        # Creating a list of absolute files.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)
        # Checking if there are more than 3 files in the directory location.
        # Removing all the files from direcotry.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)
        imageUrl = self.baseUrl + book.bookImgName
        subUrl = book.bookImgName
        # Image file name is the last path component of the image URL.
        imageFileName = subUrl.split('/')[-1:][0]
        # Downloading book cover
        bookImagePath = os.path.join(directory_name, subUrl.split('/')[-1:][0])
        urllib.urlretrieve(imageUrl, bookImagePath)
        book.bookImgName = imageFileName
        #writing json file
        self.writeJsonToDir(directory_name, book)
        # Machine-specific Firefox binary path -- TODO make configurable.
        binary = FirefoxBinary('/docs/python_projects/firefox/firefox')
        fp = webdriver.FirefoxProfile()
        fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
        # folderList=2: use the custom download dir configured below.
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference('browser.download.manager.showWhenStarting', False)
        fp.set_preference('browser.download.manager.focusWhenStarting', False)
        fp.set_preference("browser.download.dir", directory_name)
        fp.set_preference("browser.download.manager.scanWhenDone", False)
        fp.set_preference("browser.download.manager.useWindow", False)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")
        fp.update_preferences()
        driver = webdriver.Firefox(firefox_profile=fp, firefox_binary=binary)
        # driver.find_element_by_xpath("html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[1]/img")
        driver.get(url)
        # The download anchor's text was scraped earlier into book.name.
        efd_link = driver.find_element_by_link_text(book.name)
        # Position-based XPaths into the page's layout table.
        book.fileSize = driver.find_element_by_xpath(
            "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[8]/td[2]/b"
        ).text
        book.bookFormat = driver.find_element_by_xpath(
            "html/body/table/tbody/tr[2]/td/div/table/tbody/tr/td[2]/table/tbody/tr[9]/td[2]/b"
        ).text
        efd_link.click()
        # Poll every 10s: Firefox keeps a '.part' file while downloading;
        # once none is left the download is considered complete.
        flag = True
        while (flag):
            # # checking part file
            time.sleep(10)
            lst = []
            files = []
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lst.append(sName.split('.')[-1:][0])
                    files.append(os.path.join(directory_name, sName))
            print lst
            if 'part' not in lst:
                flag = False
                time.sleep(10)
                driver.close()
            else:
                #print files
                # if not self.isBookDownloading(files):
                #     driver.close()
                pass

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        this function will write json file to given dir.

        Serializes the Book's __dict__ (authors flattened to dicts,
        datetime stringified) as book.json inside bookPath.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            # NOTE(review): this mutates the Book's own __dict__ in place.
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                authors.append(author)
            row2dict['authors'] = authors
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            # Best-effort: metadata write failure is logged, not raised.
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method will inform that book is getting downloading or not.

        Samples the sizes of '.part' files across loop iterations; if the
        sampled sizes differ the download is treated as stalled/finished
        (returns False), otherwise True.
        '''
        #time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        # Snapshot current sizes of all given files.
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):
            # NOTE(review): every slot references the same dict, so the
            # three "samples" compared below are always identical.
            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
                # print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
                checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        '''Entry point: crawl it-ebooks.info and download everything new.'''
        baseUrl = 'http://it-ebooks.info'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        itebook.findAllBookUrl()

    def getMaxBookID(self):
        # Highest book id in the database; 0 when the database is empty.
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function will create directory to download book.
        @param number:it takes database maxId+1 to create new directory .
        '''
        # Next free numbered directory under the library path.
        directory_name = os.path.join(self.directory_name, str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        # Side effect: the process working directory is changed.
        os.chdir(directory_name)
        return directory_name
class FullCircleMagazine(): def __init__(self, baseUrl=None): self.baseUrl = baseUrl self.directory_name = Workspace().libraryPath self.createDatabase = CreateDatabase() self.header_info = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'} # book image url self.imageUrl = None self.bookUrl = None pass def downloadFullCircleMagazine(self, url, book=None, bookUrl=None): ''' AQGPK3595C ''' # url = 'http://dl.fullcirclemagazine.org/issue1_en.pdf' # 'http://dl.fullcirclemagazine.org/issue3_en.pdf' directory_name = self.createDownloadDir() bookImagePath = os.path.join(directory_name, book.bookImgName) os.chdir(directory_name) r = requests.get(url, headers=self.header_info, timeout=30) if r.status_code == 200: print r.status_code, url print '------->', int(r.headers["content-length"]) / 1000000 book.fileSize = str(round(int(r.headers["content-length"]) / 1000000 , 2)) + ' MB' self.writeJsonToDir(directory_name, book) self.downloadBookImage(bookImagePath, self.imageUrl) # r = requests.get(bookUrl, headers=self.header_info, timeout=30) print '--------------->', r.url bookPath = os.path.join(directory_name, url.split('/')[-1]) print bookPath with open(bookPath, 'wb') as bookFile: bookFile.write(r.content) self.updateDatabase(directory_name) return r.status_code, directory_name def createBookDetail(self, bookName=None): book = Book() book.bookName = "Full Circle "+ bookName book.bookFormat = 'pdf' book.tag = 'Technology' book.inLanguage = 'English' book.subTitle = 'Magazine' book.publisher = "Full Circle" book.bookImgName = bookName + '.jpg' book.hasCover = 'Yes' book.hasCode = 'No' return book def writeJsonToDir(self, bookPath=None, book=None): try: f = open(os.path.join(bookPath, 'book.json'), 'w') row2dict = book.__dict__ authors = [] if type(row2dict['publishedOn']) == datetime: row2dict['publishedOn'] = str(row2dict['publishedOn']) for a in row2dict['authors']: author = {} if type(a) == str: author['authorName'] = a else: author = 
a.__dict__ authors.append(author) row2dict['authors'] = authors if not row2dict['isbn_13'] == None: if str(row2dict['isbn_13']).strip() == '': row2dict['isbn_13'] = None f.write(json.dumps(row2dict, sort_keys=False, indent=4)) f.close() except: traceback.print_exc() def downloadBookImage(self, bookImagePath=None, imageUrl=None): ''' this method will download image from imageUrl location and keep it at bookImagePath ''' print imageUrl head, data = imageUrl.split(',', 1) bits = head.split(';') mime_type = bits[0] if bits[0] else 'text/plain' charset, b64 = 'ASCII', False for bit in bits: if bit.startswith('charset='): charset = bit[8:] elif bit == 'base64': b64 = True # Do something smart with charset and b64 instead of assuming plaindata = data.decode("base64") # Do something smart with mime_type with open(bookImagePath, 'wb') as f: f.write(plaindata) print 'write image complete' # from PIL import Image # from StringIO import StringIO # r = requests.get(imageUrl, headers=self.header_info, timeout=30) # print '--------------->', r.url # with open(bookImagePath, 'wb') as imageFile: # imageFile.write(r.content) def updateDatabase(self, directory_name): # self.createDatabase.creatingDatabase() # self.createDatabase.addingData() self.createDatabase.addSingleBookData(directory_name) def isIsbnAvailableInDatabase(self, isbn_13=None): isBookPresent = False book = self.createDatabase.findByIsbn_13Name(isbn_13) if book: isBookPresent = True return isBookPresent def isBookNameAvailableInDatabase(self, bookName=None): isBookPresent = False book = self.createDatabase.findByBookName(bookName) if book: isBookPresent = True return isBookPresent def createDownloadDir(self): ''' This function will create directory to download book. @param number:it takes database maxId+1 to create new directory . 
''' directory_name = os.path.join(self.directory_name, str(self.getMaxBookID() + 1)) if not os.path.exists(directory_name): os.makedirs(directory_name,755) os.chdir(directory_name) return directory_name def getMaxBookID(self): maxBookId = self.createDatabase.getMaxBookID() if not maxBookId: maxBookId = 0 return maxBookId def getImageUrl(self, completeUrl, issueCount): print completeUrl imageUrl = None r = requests.get(completeUrl, headers=self.header_info, timeout=30) if r.status_code == 200: soup = BeautifulSoup(r.content, "lxml") # print soup alt = soup.find(class_='issuetable').find('img')['alt'] if alt == 'Cover for Issue '+issueCount+' in English': imageUrl = soup.find(class_='issuetable').find('img')['src'] print imageUrl return imageUrl def startDownload(self): logic = True i = 1 while logic: pdfUrl = 'http://dl.fullcirclemagazine.org/issue' + str(i) + '_en.pdf' completeUrl = 'http://fullcirclemagazine.org/issue-' + str(i) + '/' if not self.isIssuePresent(str(i)): self.imageUrl = self.getImageUrl(completeUrl,str(i)) book = self.createBookDetail('Issue ' + str(i)) status_code, directory_name = self.downloadFullCircleMagazine(book=book, url=pdfUrl) print completeUrl, status_code if status_code != 200: logic = False i = i + 1 def isIssuePresent(self, issue=None): isBookPresent = False bookName="Full Circle Issue " + issue book = self.createDatabase.findByBookName(bookName) if book: isBookPresent = True return isBookPresent def getIssueDetail(self): url='https://wiki.ubuntu.com/UbuntuMagazine/FullIssueIndex' r = requests.get(url, headers=self.header_info, timeout=30) if r.status_code == 200: soup = BeautifulSoup(r.content, "lxml") tables=soup.findAll('table') for table in tables: print table