def SaveCaptcha(url):
    """Download the captcha image at ``url`` into the project's tmp directory.

    The target path is ``opt['prjPath'] + '/tmp/'`` followed by a name built
    from ``id_generator()`` and the extension reported by
    ``GetFileExtFromURL(url)``.

    Args:
        url: Address of the image to download.

    Returns:
        The path of the saved file on success, or the string ``'no image'``
        when the download raises an error.
    """
    ipath = opt['prjPath'] + '/tmp/'
    ext = GetFileExtFromURL(url)
    filename = id_generator() + ext
    target = ipath + filename

    # NOTE(review): debug_post is passed as the string 'True', not the
    # boolean True -- confirm that this is what Grab expects.
    g = Grab(
        connect_timeout=5,
        userpwd='user:pass',
        debug_post='True',
        log_dir='log',
        headers={'Accept-Language': 'ru,en;q=0.8'},
    )

    try:
        g.download(url, target)
    except Exception:
        # Best-effort download: any ordinary failure is reported through the
        # sentinel return value.  Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return 'no image'
    return target
def SaveImage(url):
    """Download the image at ``url`` into the project's ``img`` directory.

    The file name is built from ``id_generator()`` and the extension reported
    by ``GetFileExtFromURL(url)``.  On success a "<filename> saved" line is
    printed.

    Args:
        url: Address of the image to download.

    Returns:
        ``opt['imgServerPath'] + filename`` on success, or the string
        ``'no image'`` when the download raises an error.
    """
    # The directory suffix is a literal backslash, "img" and a forward slash.
    # The backslash is escaped explicitly (same runtime value as before) so
    # the literal no longer relies on an invalid escape sequence.
    ipath = opt['prjPath'] + '\\img/'
    ext = GetFileExtFromURL(url)
    filename = id_generator() + ext

    # NOTE(review): debug_post is passed as the string 'True', not the
    # boolean True -- confirm that this is what Grab expects.
    g = Grab(
        connect_timeout=5,
        userpwd='user:pass',
        debug_post='True',
        log_dir='log',
        headers={'Accept-Language': 'ru,en;q=0.8'},
    )

    try:
        g.download(url, ipath + filename)
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print(filename + " saved")
        return opt['imgServerPath'] + filename
    except Exception:
        # Best-effort download: any ordinary failure is reported through the
        # sentinel return value.  Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return 'no image'
class PacktPub():
    '''Wrapper around a Grab session for a packtpub.com account.

    Attributes:
        g: the Grab instance used for every request.
        logged_in: True once login() has completed successfully.
    '''

    def __init__(self):
        '''Create the Grab session with redirect and timeout options set.'''
        session = Grab()
        session.setup(
            follow_location=True,
            follow_refresh=True,
            timeout=120,
            connect_timeout=10,
        )
        # A body_maxsize=512000 limit was previously configured here; it
        # remains disabled.
        self.g = session
        self.logged_in = False

    def login(self, email, password):
        '''Submit the home-page login form with the given credentials.

        The page is saved to /tmp before and after the form is submitted.
        After submitting, the response is asserted to contain the text
        '"sid":' and only then is the session marked as logged in.
        '''
        browser = self.g

        browser.go('https://www.packtpub.com/')
        browser.doc.save('/tmp/packtpub-home.html')

        browser.doc.choose_form(id='packt-user-login-form')
        print("Logging in with account: {}".format(email))
        browser.doc.set_input('email', email)
        browser.doc.set_input('password', password)
        browser.doc.submit()

        # The document is re-read from the session after the submit request.
        browser.doc.save('/tmp/packpub-home-after-login.html')
        browser.doc.text_assert('"sid":')
        self.logged_in = True

    def get_ebooks_list(self, url="https://www.packtpub.com/account/my-ebooks"):
        '''Load the purchased-ebooks page and return its books as a list.

        Each matching product node on the page is parsed into a PacktBook.

        Raises:
            LoggedOutException: url starts with "http" and the session is
                not logged in.
        '''
        is_remote = url.startswith("http")
        if is_remote and not self.logged_in:
            raise LoggedOutException("Must be logged in before getting ebooks list!")

        browser = self.g
        browser.go(url)
        browser.doc.save('/tmp/packtpub-my-ebooks.html')
        browser.doc.text_assert('<h1>My eBooks </h1>')

        product_nodes = browser.doc.select(
            '//div[@id="product-account-list"]/div[starts-with(@class, "product-line")][@title]'
        )

        books = []
        for node in product_nodes:
            entry = PacktBook()
            entry.parse_from_xsel(node)
            books.append(entry)
        return books

    def download_book_all(self, book: PacktBook, destination_directory):
        '''Download the PDF of the given book into destination_directory.

        The directory is created (mode 0o775) when it does not exist yet, and
        the file is stored as "<destination_directory>/<safe name>.pdf".

        Raises:
            LoggedOutException: the session is not logged in.
        '''
        if not self.logged_in:
            raise LoggedOutException("Must be logged in before download!")

        base_name = book.get_safe_name()

        directory_exists = os.path.exists(destination_directory)
        if not directory_exists:
            os.makedirs(destination_directory, mode=0o775, exist_ok=True)

        print("Downloading PDF of {} from {}".format(base_name, book.dl_pdf))
        pdf_path = destination_directory + "/" + base_name + ".pdf"
        self.g.download(book.dl_pdf, pdf_path)
desc = re.sub('<a>.+?</a>', '', desc) desc = re.sub('<[^>]*>', '', desc) desc = desc.decode('utf-8') else: desc = '' image = 'http://kampfer.ru' + ''.join(doc.xpath('//*[@id="img-current_picture"]/@src')) image_name = '%s.%s' %(number, image.split('.')[-1]) category = ''.join(doc.xpath('//li[@class="child current" or @class=" current"]/a/text()')).strip() count_tabs = len(doc.xpath('//li[@class="child current" or @class=" current"]/a/img')) if count_tabs >= 2 and category[0].upper() in string.ascii_uppercase: try: category = doc.xpath('//li[@class="child current" or @class=" current"]/preceding-sibling::li/a[count(img)=%s]/text()' %(count_tabs-1))[-1].strip() except: category = '' print '[ERROR] category' try: g.download(image, os.path.join('images', image_name)) except GrabNetworkError: print 'Fake download image' except IOError: print 'IOError' image_counter = count(1) image_number = image_counter.next() for extimageurl in doc.xpath('//div[@class="dopf"]//img/@src'): try: g.download('http://kampfer.ru' + extimageurl, os.path.join('images', '%s_%s.%s' %(number, image_number, extimageurl.split('.')[-1]))) image_number = image_counter.next() except GrabNetworkError: continue except IOError: print 'IOError' ws0.write(rownum, 0, number) ws0.write(rownum, 1, number) ws0.write(rownum, 2, number)