def main():
    args = sys.argv[1:]
    crit = input.criteria()
    input.run(args, crit)
    storage = input.criteria()
    storage.data["department"] = "asd-CHEM"
    with requests.Session() as s:
        scraper.login(s)
        scraper.download(s.get(GOLD_SEARCH_URL), DEFAULT_GOLD_FILE_PATH, "search")
        scraper.post_search(crit, s, "chem3")
        html_extraction.parse_to_file("chem3", pretty=True)
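# The snippets in this file all delegate authentication to a project-local
# scraper.login. A minimal sketch of a session-based login helper in the same
# spirit, assuming a plain form POST; the URL and field names below are
# hypothetical, and the original scraper.login(s) takes only the session, so it
# presumably reads its credentials from configuration instead of arguments:
import requests

LOGIN_URL = "https://example.com/login"  # placeholder endpoint, not from the original code

def login(session: requests.Session, username: str, password: str) -> bool:
    """POST credentials on an existing session so later requests stay authenticated."""
    resp = session.post(LOGIN_URL, data={"username": username, "password": password})
    # crude success check: the request succeeded and a session cookie was set
    return resp.ok and bool(session.cookies)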
def new_user_registration():
    if request.method == 'GET':
        pk = request.args.get('key')
        if collection.count_documents({"_id": "W" + str(pk)}) > 0:
            form = RegisterForm(fb_id=pk)
            return render_template('register.html', form=form)
        else:
            return '404'
    else:
        fb_id = request.form.get('fb_id')
        gla_id = request.form.get('gla_id')
        gla_pass = request.form.get('gla_pass')
        loginResult = scraper.login(gla_id, gla_pass)
        if loginResult == 2:
            return '<h1> Wrong credentials. <a href="{}/register?key={}">Try again.</a></h1>'.format(
                app_url, fb_id)
        elif loginResult == 3:
            return '<h1> Something went wrong. <a href="{}/register?key={}">Try again.</a></h1>'.format(
                app_url, fb_id)
        # any other login result is treated as success: store the (encrypted)
        # credentials and drop the pending-registration placeholder record
        collection.insert_one({
            "_id": fb_id,
            "guid": gla_id,
            "thing": f.encrypt(gla_pass.encode()),
            "loggedIn": 1
        })
        collection.delete_one({"_id": "W" + fb_id})
        return '<h1> Login successful! You can now close this page and chat to the bot. </h1>'
def register():
    if request.method == 'GET':
        key = request.args.get('key')
        app.logger.info('uid:{} requested registration'.format(key))
        if r.exists('IN_REG:' + key):
            app.logger.info('uid:{} is undergoing registration'.format(key))
            form = RegisterForm(uid=key)
            return render_template('register.html', form=form)
        else:
            app.logger.info('uid:{} expired/invalid registration key'.format(key))
            return '404'
    else:
        regno = request.form.get('regno')
        password = request.form.get('password')
        uid = request.form.get('uid')
        if scraper.login(regno, password) is None:
            app.logger.info('uid:{} provided wrong credentials'.format(uid))
            return '<h1> Wrong credentials </h1>'
        app.logger.info('uid:{} has registered'.format(uid))
        r.delete('IN_REG:' + uid)
        r.set(uid, json.dumps({'regno': regno, 'password': password}))
        return '<h1> Registration complete </h1>'
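# Both registration views above render a RegisterForm into register.html. The two
# views read different field names (fb_id/gla_id/gla_pass vs. uid/regno/password),
# so each pairs with its own form class. A minimal Flask-WTF sketch following the
# second view -- an illustration of the assumed form, not the original class:
from flask_wtf import FlaskForm
from wtforms import HiddenField, PasswordField, StringField, SubmitField
from wtforms.validators import DataRequired

class RegisterForm(FlaskForm):
    # hidden key tying the web registration back to the chat user
    uid = HiddenField()
    regno = StringField('Registration number', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Register')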
# Python 2 / legacy Kodi plugin: urlparse, urllib.urlencode and the iconImage
# ListItem argument belong to the old API. sys and urllib imports added; they
# are required by the code below.
import sys
import urllib
import urlparse

import xbmc
import xbmcgui
import xbmcplugin

addon_url = sys.argv[0]
addon_handle = int(sys.argv[1])
args = urlparse.parse_qs(sys.argv[2][1:])
page = args.get("page", [None])[0]


def build_url(query):
    return addon_url + '?' + urllib.urlencode(query)


def get_videos():
    videos = scraper.list_videos()
    for video in videos:
        url = build_url({"page": "resolve", "url": video["url"]})
        li = xbmcgui.ListItem(video["label"], iconImage="DefaultVideo.png")
        xbmcplugin.addDirectoryItem(handle=addon_handle, url=url, listitem=li)
    xbmcplugin.endOfDirectory(addon_handle)


if page is None:
    scraper.login(xbmcplugin.getSetting(addon_handle, 'username'),
                  xbmcplugin.getSetting(addon_handle, 'password'))
    get_videos()
elif page == "resolve":
    xbmc.Player().play(scraper.resolve_url(args.get("url")[0]))
def main():
    parser = argparse.ArgumentParser(
        description="Scrape blinkist.com and generate pretty output")
    parser.add_argument(
        "--language",
        choices={"en", "de"},
        default="en",
        help="The language to scrape books in - either 'en' for English or 'de' for German",
    )
    parser.add_argument(
        "--match-language",
        action="store_true",
        default=False,
        help="Skip scraping books if not in the requested language (not all books are available in German)",
    )

    def check_cooldown(value):
        if int(value) < 1:
            raise argparse.ArgumentTypeError("Can't be smaller than 1")
        return int(value)

    parser.add_argument(
        "--cooldown",
        type=check_cooldown,
        default=1,
        help="Seconds to wait between scraping books, and downloading audio files. Can't be smaller than 1",
    )
    parser.add_argument(
        "--headless",
        action="store_true",
        default=False,
        help="Start the automated web browser in headless mode. Works only if you have already logged in once",
    )
    parser.add_argument(
        "--audio",
        action="store_true",
        default=False,
        help="Download the audio blinks for each book",
    )
    parser.add_argument(
        "--concat-audio",
        action="store_true",
        default=False,
        help="Concatenate the audio blinks into a single file and tag it. Requires ffmpeg",
    )
    parser.add_argument(
        "--keep-noncat",
        action="store_true",
        default=False,
        help="Keep the individual blink audio files, instead of deleting them (works with '--concat-audio' only)",
    )
    parser.add_argument(
        "--no-scrape",
        action="store_true",
        default=False,
        help="Don't scrape the website, only process existing json files in the dump folder. Do not provide email or password with this option.",
    )
    parser.add_argument(
        "--book",
        default=False,
        help="Scrapes this book only, takes the blinkist url for the book "
        "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)",
    )
    parser.add_argument(
        "--daily-book",
        action="store_true",
        default=False,
        help="Scrapes the free daily book only.",
    )
    parser.add_argument(
        "--books",
        default=False,
        help="Scrapes the list of books, takes a txt file with the list of blinkist urls for the books "
        "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)",
    )
    parser.add_argument(
        "--book-category",
        default="Uncategorized",
        help="When scraping a single book, categorize it under this category (works with '--book' only)",
    )
    parser.add_argument(
        "--categories",
        type=str,
        nargs="+",
        default="",
        help=("Only the categories whose label contains at least one string here will be scraped. "
              "Case-insensitive; use spaces to separate categories "
              "(e.g. '--categories entrep market' will only scrape books under 'Entrepreneurship' and 'Marketing & Sales')"),
    )
    parser.add_argument(
        "--ignore-categories",
        type=str,
        nargs="+",
        default="",
        help=("If a category label contains anything in ignored_categories, books under that category will not be scraped. "
              "Case-insensitive; use spaces to separate categories "
              "(e.g. '--ignore-categories entrep market' will skip scraping of 'Entrepreneurship' and 'Marketing & Sales')"),
    )
    parser.add_argument(
        "--create-html",
        action="store_true",
        default=True,
        help="Generate a formatted html document for the book",
    )
    parser.add_argument(
        "--create-epub",
        action="store_true",
        default=True,
        help="Generate a formatted epub document for the book",
    )
    parser.add_argument(
        "--create-pdf",
        action="store_true",
        default=False,
        help="Generate a formatted pdf document for the book. Requires wkhtmltopdf",
    )
    parser.add_argument(
        "--save-cover",
        action="store_true",
        default=False,
        help="Save a copy of the Blink cover artwork in the folder",
    )
    parser.add_argument(
        "--embed-cover-art",
        action="store_true",
        default=False,
        help="Embed the Blink cover artwork into the concatenated audio file (works with '--concat-audio' only)",
    )
    parser.add_argument(
        "--chromedriver",
        help="Path to a specific chromedriver executable instead of the built-in one",
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Increases logging verbosity")
    if "--no-scrape" not in sys.argv:
        parser.add_argument("email", help="The email to log into your premium Blinkist account")
        parser.add_argument("password", help="The password to log into your premium Blinkist account")

    args = parser.parse_args()

    # set up logger verbosity
    logger.set_verbose(log, args.verbose)

    def generate_book_outputs(book_json, cover_img=False):
        if args.create_html:
            generator.generate_book_html(book_json, cover_img)
        if args.create_epub:
            generator.generate_book_epub(book_json)
        if args.create_pdf:
            generator.generate_book_pdf(book_json, cover_img)

    def scrape_book(driver, processed_books, book_url, category, match_language):
        book_json, dump_exists = scraper.scrape_book_data(
            driver, book_url, category=category, match_language=match_language)
        if book_json:
            cover_img_file = False
            cover_tmp_file = False
            if args.audio:
                audio_files = scraped_audio_exists(book_json)
                if not audio_files:
                    audio_files = scraper.scrape_book_audio(
                        driver, book_json, args.language)
                if audio_files and args.concat_audio:
                    if type(audio_files) == list:
                        if args.embed_cover_art:
                            cover_tmp_file = scraper.download_book_cover_image(
                                book_json, filename="_cover.jpg", alt_file="cover.jpg")
                        generator.combine_audio(book_json, audio_files,
                                                args.keep_noncat, cover_tmp_file)
            if args.save_cover:
                cover_img_file = scraper.download_book_cover_image(
                    book_json, filename="cover.jpg", alt_file="_cover.jpg")
                generate_book_outputs(book_json, cover_img=cover_img_file)
            else:
                generate_book_outputs(book_json)
            if cover_tmp_file:
                if os.path.exists(cover_tmp_file):
                    log.debug(f"Deleting {cover_tmp_file}")
                    os.remove(cover_tmp_file)
                else:
                    log.debug(f'Could not find "{cover_tmp_file}"')
            processed_books.append(book_url)
        return dump_exists

    def finish(start_time, processed_books, driver=None):
        if driver:
            driver.close()
        elapsed_time = time.time() - start_time
        formatted_time = "{:02d}:{:02d}:{:02d}".format(
            int(elapsed_time // 3600),
            int(elapsed_time % 3600 // 60),
            int(elapsed_time % 60),
        )
        total_books = len(processed_books)
        log.info(
            f"Processed {total_books} book{'s' if total_books != 1 else ''} in {formatted_time}"
        )

    # start scraping
    log.info("Starting scrape run...")
    processed_books = []
    start_time = time.time()

    if args.no_scrape:
        # if the --no-scrape argument is passed, just process the
        # existing json dump files
        for file in glob.glob(os.path.join(os.getcwd(), "dump", "*.json")):
            generate_book_outputs(file)
            processed_books.append(file)
        finish(start_time, processed_books)
    else:
        match_language = args.language if args.match_language else ""
        # if no login cookies were found, don't start a headless browser
        # so that the user can solve recaptcha and log in
        start_headless = args.headless
        if not scraper.has_login_cookies():
            start_headless = False
        # add uBlock (if the conditions are right)
        use_ublock = not (args.book or args.headless)
        driver = scraper.initialize_driver(
            headless=start_headless,
            with_ublock=use_ublock,
            chromedriver_path=args.chromedriver,
        )

        is_logged_in = scraper.login(driver, args.language, args.email, args.password)
        if is_logged_in:
            if args.book or args.daily_book:
                # scrape single book
                book_url = (args.book if not args.daily_book else
                            scraper.get_daily_book_url(driver, args.language))
                scrape_book(
                    driver,
                    processed_books,
                    book_url,
                    category={"label": args.book_category},
                    match_language=match_language,
                )
            elif args.books:
                # scrape list of books
                with open(args.books, "r") as books_urls:
                    for book_url in books_urls.readlines():
                        dump_exists = scrape_book(
                            driver,
                            processed_books,
                            book_url.strip(),
                            category={"label": args.book_category},
                            match_language=match_language,
                        )
                        if not dump_exists:
                            time.sleep(args.cooldown)
            else:
                # scrape all books / categories
                all_books = scraper.get_all_books(driver, args.language)
                categories = scraper.get_categories(
                    driver,
                    args.language,
                    specified_categories=args.categories,
                    ignored_categories=args.ignore_categories,
                )
                for category in categories:
                    books_urls = scraper.get_all_books_for_categories(driver, category)
                    for book_url in books_urls:
                        dump_exists = scrape_book(
                            driver,
                            processed_books,
                            book_url,
                            category=category,
                            match_language=match_language,
                        )
                        # if we processed the book from an existing dump
                        # no scraping was involved, no need to cooldown
                        if not dump_exists:
                            time.sleep(args.cooldown)
                uncategorized_books = [x for x in all_books if x not in processed_books]
                log.info(
                    f"Scraping {len(uncategorized_books)} remaining uncategorized books..."
                )
                for book_url in uncategorized_books:
                    dump_exists = scrape_book(
                        driver,
                        processed_books,
                        book_url,
                        category={"label": "Uncategorized"},
                        match_language=match_language,
                    )
                    if not dump_exists:
                        time.sleep(args.cooldown)
        else:
            log.error("Unable to log into Blinkist")
        finish(start_time, processed_books, driver)
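# For reference, typical invocations of the CLI defined above -- the entry-point
# file name "main.py" and the credentials are placeholders, not taken from the code:
#
#   python main.py you@example.com yourpassword --audio --concat-audio --create-pdf
#   python main.py you@example.com yourpassword --daily-book
#   python main.py --no-scrape    # reprocess existing dump/*.json files; no credentials needed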
start_time = time.time()
try:
    if args.no_scrape:
        # if the --no-scrape argument is passed, just process the existing json dump files
        for file in glob.glob(os.path.join("dump", "*.json")):
            process_book_json(file, processed_books)
        finish(None, start_time, processed_books)
    else:
        match_language = args.language if args.match_language else ""
        # if no login cookies were found, don't start a headless browser
        # so that the user can solve recaptcha and log in
        start_headless = args.headless
        if not scraper.has_login_cookies():
            start_headless = False
        driver = scraper.initialize_driver(headless=start_headless)

        is_logged_in = scraper.login(driver, args.language, args.email, args.password)
        if is_logged_in:
            if args.book:
                scrape_book(driver, processed_books, args.book,
                            category={"label": args.category},
                            match_language=match_language)
            else:
                categories = scraper.get_categories(driver, args.language)
                for category in categories:
                    books_urls = scraper.get_all_books_for_categories(driver, category)
                    for book_url in books_urls:
                        dump_exists = scrape_book(
                            driver,
import logging
import operator
from collections import defaultdict

from selenium import webdriver

# `args`, `login` and `scrape_once` are defined elsewhere in the original script.

# route print() through logging so the --quiet flag can silence informational output
print = logging.info
logging.basicConfig(level=logging.WARNING if args.quiet else logging.INFO,
                    format="%(message)s")


def read_login(filename: str):
    # read username and password from the first two lines of the file,
    # stripping the trailing newlines left by readline()
    with open(filename, 'r') as f:
        username = f.readline().strip()
        password = f.readline().strip()
    return (username, password)


print('Initialising scraper...')
options = webdriver.FirefoxOptions()
options.headless = args.quiet
driver = webdriver.Firefox(options=options)

loginData = read_login(args.filename)
login(loginData[0], loginData[1], driver)

tagDict = defaultdict(int)
print('Scraping data...')
for i in range(args.passes):
    print('Pass no. %d' % (i + 1))
    scrape_once(driver, args.collect_dict, tagDict)

for tag in sorted(tagDict.items(), key=operator.itemgetter(1), reverse=True):
    print(tag[0] + ' ' + str(tag[1]))

driver.quit()
def parse_message(message, id):
    r = collection.find_one({"_id": id})
    if r['loggedIn'] == 0:
        bot.send_text_message(id, "Logging in..")
        bot.send_action(id, "typing_on")
        loginResult = scraper.login(r['guid'], (f.decrypt(r['thing'])).decode())
        if loginResult == 1:
            collection.update_one({"_id": id}, {'$set': {'loggedIn': 1}})
            bot.send_text_message(id, "Logged in!")
            try:
                parse = witClient.message(message)
                bot.send_action(id, "typing_on")
                if 'datetime' in parse['entities']:
                    return scraper.specific_day(
                        parse['entities']['datetime'][0]['value'][:10], r['guid'])
                elif 'read_next' in parse['entities']:
                    return scraper.read_now(r['guid'])
                else:
                    return "What's up?"
            except:
                return "So, what's up?"
        else:
            collection.delete_one({"_id": id})
            collection.insert_one({"_id": "W" + id})
            return "Something went wrong.\nRegister here: {}/register?key={}".format(
                app_url, id)
    else:
        if scraper.check_browser(r['guid']):
            try:
                parse = witClient.message(message)
                bot.send_action(id, "typing_on")
                if 'logout' in parse['entities']:
                    scraper.close(r['guid'])
                    collection.update_one({"_id": id}, {'$set': {'loggedIn': 0}})
                    return "Logged out! Goodbye. :)"
                elif 'delete_data' in parse['entities']:
                    scraper.close(r['guid'])
                    collection.delete_one({"_id": id})
                    return "Deleted! :) "
                elif 'datetime' in parse['entities']:
                    return scraper.specific_day(
                        parse['entities']['datetime'][0]['value'][:10], r['guid'])
                elif 'read_next' in parse['entities']:
                    return scraper.read_now(r['guid'])
                else:
                    return "Not sure how to answer that."
            except:
                return "Something went wrong with parsing that."
        else:
            collection.update_one({"_id": id}, {'$set': {'loggedIn': 0}})
            return "You have been logged out due to some error or being idle for too long. Say hello to log in again. :) "
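# The `f` object used above for encrypting and decrypting stored credentials is
# consistent with the cryptography library's Fernet API. A minimal setup sketch;
# the FERNET_KEY environment variable name is an assumption, not from the original:
import os
from cryptography.fernet import Fernet

# generate the key once with Fernet.generate_key() and store it securely
f = Fernet(os.environ["FERNET_KEY"].encode())

token = f.encrypt("my-password".encode())  # bytes token, safe to store in MongoDB
plain = f.decrypt(token).decode()          # recover the original string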
def main():
    parser = argparse.ArgumentParser(
        description="Scrape blinkist.com and generate pretty output")
    parser.add_argument("--language", choices={"en", "de"}, default="en",
                        help="The language to scrape books in - either 'en' for English or 'de' for German")
    parser.add_argument("--match-language", action="store_true", default=False,
                        help="Skip scraping books if not in the requested language (not all books are available in German)")

    def check_cooldown(value):
        if int(value) < 1:
            raise argparse.ArgumentTypeError("Can't be smaller than 1")
        return int(value)

    parser.add_argument("--cooldown", type=check_cooldown, default=1,
                        help="Seconds to wait between scraping books, and downloading audio files. Can't be smaller than 1")
    parser.add_argument("--headless", action="store_true", default=False,
                        help="Start the automated web browser in headless mode. Works only if you have already logged in once")
    parser.add_argument("--audio", action="store_true", default=True,
                        help="Download the audio blinks for each book")
    parser.add_argument("--concat-audio", action="store_true", default=False,
                        help="Concatenate the audio blinks into a single file and tag it. Requires ffmpeg")
    parser.add_argument("--keep-noncat", action="store_true", default=False,
                        help="Keep the individual blink audio files, instead of deleting them (works with '--concat-audio' only)")
    parser.add_argument("--no-scrape", action="store_true", default=False,
                        help="Don't scrape the website, only process existing json files in the dump folder. Do not provide email or password with this option.")
    parser.add_argument("--book", default=False,
                        help="Scrapes this book only, takes the blinkist url for the book "
                             "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)")
    parser.add_argument("--books", default=False,
                        help="Scrapes the list of books, takes a txt file with the list of blinkist urls for the books "
                             "(e.g. https://www.blinkist.com/en/books/... or https://www.blinkist.com/en/nc/reader/...)")
    parser.add_argument("--book-category", default="Uncategorized",
                        help="When scraping a single book, categorize it under this category (works with '--book' only)")
    parser.add_argument("--categories", type=str, nargs="+", default="",
                        help=("Only the categories whose label contains at least one string here will be scraped. "
                              "Case-insensitive; use spaces to separate categories "
                              "(e.g. '--categories entrep market' will only scrape books under 'Entrepreneurship' and 'Marketing & Sales')"))
    parser.add_argument("--ignore-categories", type=str, nargs="+", default="",
                        help=("If a category label contains anything in ignored_categories, books under that category will not be scraped. "
                              "Case-insensitive; use spaces to separate categories "
                              "(e.g. '--ignore-categories entrep market' will skip scraping of 'Entrepreneurship' and 'Marketing & Sales')"))
    parser.add_argument("--create-html", action="store_true", default=True,
                        help="Generate a formatted html document for the book")
    parser.add_argument("--create-epub", action="store_true", default=True,
                        help="Generate a formatted epub document for the book")
    parser.add_argument("--create-pdf", action="store_true", default=False,
                        help="Generate a formatted pdf document for the book. Requires wkhtmltopdf")
    parser.add_argument("--save-cover", action="store_true", default=False,
                        help="Save a copy of the Blink cover artwork in the folder")
    parser.add_argument("--embed-cover-art", action="store_true", default=False,
                        help="Embed the Blink cover artwork into the concatenated audio file (works with '--concat-audio' only)")
    parser.add_argument("--chromedriver",
                        help='Path to a specific chromedriver executable instead of the built-in one')
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Increases logging verbosity")
    if '--no-scrape' not in sys.argv:
        parser.add_argument("email", help="The email to log into your premium Blinkist account")
        parser.add_argument("password", help="The password to log into your premium Blinkist account")

    args = parser.parse_args()

    # set up logger
    log.setLevel(logging.INFO if not args.verbose else logging.DEBUG)
    log_screen_handler = logging.StreamHandler(stream=sys.stdout)
    log.addHandler(log_screen_handler)
    log.propagate = False
    try:
        import colorama, copy

        LOG_COLORS = {
            logging.DEBUG: colorama.Fore.GREEN,
            logging.INFO: colorama.Fore.BLUE,
            logging.WARNING: colorama.Fore.YELLOW,
            logging.ERROR: colorama.Fore.RED,
            logging.CRITICAL: colorama.Back.RED,
        }

        class ColorFormatter(logging.Formatter):
            def format(self, record, *args, **kwargs):
                # if the corresponding logger has children, they may receive a modified
                # record, so we want to keep it intact
                new_record = copy.copy(record)
                if new_record.levelno in LOG_COLORS:
                    new_record.levelname = "{color_begin}{level}{color_end}".format(
                        level=new_record.levelname,
                        color_begin=LOG_COLORS[new_record.levelno],
                        color_end=colorama.Style.RESET_ALL,
                    )
                return super(ColorFormatter, self).format(new_record, *args, **kwargs)

        log_screen_handler.setFormatter(
            ColorFormatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                           datefmt="{color_begin}[%H:%M:%S]{color_end}".format(
                               color_begin=colorama.Style.DIM,
                               color_end=colorama.Style.RESET_ALL)))
    except ModuleNotFoundError as identifier:
        pass

    def generate_book_outputs(book_json, cover_img=False):
        if args.create_html:
            generator.generate_book_html(book_json, cover_img)
        if args.create_epub:
            generator.generate_book_epub(book_json)
        if args.create_pdf:
            generator.generate_book_pdf(book_json, cover_img)

    def scrape_book(driver, processed_books, book_url, category, match_language):
        book_json, dump_exists = scraper.scrape_book_data(
            driver, book_url, category=category, match_language=match_language)
        if book_json:
            cover_img_file = False
            cover_tmp_file = False
            if args.audio:
                audio_files = scraped_audio_exists(book_json)
                if not audio_files:
                    audio_files = scraper.scrape_book_audio(driver, book_json, args.language)
                if audio_files and args.concat_audio:
                    if type(audio_files) == list:
                        if args.embed_cover_art:
                            cover_tmp_file = scraper.download_book_cover_image(
                                book_json, filename='_cover.jpg', alt_file='cover.jpg')
                        generator.combine_audio(book_json, audio_files,
                                                args.keep_noncat, cover_tmp_file)
            if args.save_cover:
                cover_img_file = scraper.download_book_cover_image(
                    book_json, filename='cover.jpg', alt_file='_cover.jpg')
                generate_book_outputs(book_json, cover_img=cover_img_file)
            else:
                generate_book_outputs(book_json)
            if cover_tmp_file:
                if os.path.exists(cover_tmp_file):
                    log.debug(f'Deleting {cover_tmp_file}')
                    os.remove(cover_tmp_file)
                else:
                    log.debug(f'Could not find "{cover_tmp_file}"')
            # note: this increment is local to the function; since ints are immutable,
            # the caller's counter is not updated
            processed_books += 1
        return dump_exists

    def finish(start_time, processed_books, driver=None):
        if driver:
            driver.close()
        elapsed_time = time.time() - start_time
        formatted_time = '{:02d}:{:02d}:{:02d}'.format(
            int(elapsed_time // 3600),
            int(elapsed_time % 3600 // 60),
            int(elapsed_time % 60))
        log.info(f"Processed {processed_books} books in {formatted_time}")

    # start scraping
    log.info('Starting scrape run...')
    processed_books = 0
    start_time = time.time()

    if args.no_scrape:
        # if the --no-scrape argument is passed, just process the existing json dump files
        for file in glob.glob(os.path.join(os.getcwd(), "dump", "*.json")):
            generate_book_outputs(file)
            processed_books += 1
        finish(start_time, processed_books)
    else:
        match_language = args.language if args.match_language else ""
        # if no login cookies were found, don't start a headless browser
        # so that the user can solve recaptcha and log in
        start_headless = args.headless
        if not scraper.has_login_cookies():
            start_headless = False
        # add uBlock (if the conditions are right)
        use_ublock = not (args.book or args.headless)
        driver = scraper.initialize_driver(
            headless=start_headless,
            with_ublock=use_ublock,
            chromedriver_path=args.chromedriver)

        is_logged_in = scraper.login(driver, args.language, args.email, args.password)
        if is_logged_in:
            if args.book:
                # scrape single book
                scrape_book(
                    driver,
                    processed_books,
                    args.book,
                    category={"label": args.book_category},
                    match_language=match_language)
            elif args.books:
                # scrape list of books
                with open(args.books, 'r') as books_urls:
                    for book_url in books_urls.readlines():
                        dump_exists = scrape_book(
                            driver,
                            processed_books,
                            book_url.strip(),
                            category={"label": args.book_category},
                            match_language=match_language)
                        if not dump_exists:
                            time.sleep(args.cooldown)
            else:
                # scrape all books
                categories = scraper.get_categories(
                    driver,
                    args.language,
                    specified_categories=args.categories,
                    ignored_categories=args.ignore_categories)
                for category in categories:
                    books_urls = scraper.get_all_books_for_categories(driver, category)
                    for book_url in books_urls:
                        dump_exists = scrape_book(
                            driver,
                            processed_books,
                            book_url,
                            category=category,
                            match_language=match_language)
                        # if we processed the book from an existing dump
                        # no scraping was involved, no need to cooldown
                        if not dump_exists:
                            time.sleep(args.cooldown)
        else:
            log.error("Unable to log into Blinkist")
        finish(start_time, processed_books, driver)