class SciHubAPI(QObject, threading.Thread):
    """Worker thread that resolves one query through Sci-Hub and saves the PDF.

    The thread body (run) calls rampage() with the query given at
    construction time and then invokes the optional callback.
    """

    def __init__(self, query, callback=None, conf=None, log=None):
        """Prepare the HTTP session used for all requests.

        Args:
            query: Query string passed to rampage() when the thread runs.
            callback: Optional zero-argument callable invoked at the end of
                run().
            conf: Configuration object; a new SciHubConf() is created when
                this is falsy.
            log: Optional callable taking (message, type) that replaces the
                default print-based log() method.
        """
        QObject.__init__(self)
        threading.Thread.__init__(self)

        self._query = query
        self._callback = callback
        self._conf = conf if conf else SciHubConf()

        if log:
            # Shadow the print-based log() method with the caller's logger
            self.log = log

        self._sess = requests.Session()
        self._sess.headers = json.loads(
            self._conf.get('network', 'session_header'))

        # The same retry budget applies to total, read and connect errors
        retry_times = self._conf.getint('network', 'retry_times')
        retry = Retry(total=retry_times, read=retry_times,
                      connect=retry_times)
        adapter = HTTPAdapter(max_retries=retry)
        self._sess.mount('http://', adapter)
        self._sess.mount('https://', adapter)

        self._set_http_proxy()

        self._doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'
        self._illegal_filename_pattern = r'[\/\\\:\*\?\"\<\>\|]'

    def log(self, message, type=None):
        """Print a message, prefixed with ``[type] - `` when type is given.

        Args:
            message: Text to print.
            type: Optional label such as 'INFO' or 'ERROR'.
        """
        if type:
            log_formater = '[{type}] - {message}'
        else:
            log_formater = '{message}'

        print(log_formater.format(type=type, message=message))

    def _timeout(self):
        """Return the configured network timeout divided by 1000."""
        return self._conf.getfloat('network', 'timeout') / 1000.0

    def _set_http_proxy(self):
        """Configure the session proxies from the 'proxy' config section.

        Does nothing when the proxy is not enabled. The proxy URL has the
        form ``type://[username[:password]@]host[:port]``; a password
        without a username is ignored because it cannot be expressed in
        that form.
        """
        if not self._conf.getboolean('proxy', 'enabled'):
            return

        proxy_type = self._conf.get('proxy', 'type')
        proxy_host = self._conf.get('proxy', 'host')
        proxy_port = self._conf.get('proxy', 'port')
        proxy_username = self._conf.get('proxy', 'username')
        proxy_password = self._conf.get('proxy', 'password')

        credentials = ''
        if proxy_username:
            credentials = proxy_username
            if proxy_password:
                # Username and password must be separated by a colon
                credentials += ':' + proxy_password
            credentials += '@'

        proxy = proxy_type + '://' + credentials + proxy_host
        if proxy_port:
            proxy += ':' + proxy_port

        self._sess.proxies = {'http': proxy, 'https': proxy}

    def get_pdf_metadata(self, pdf_file_stream):
        """Extract author, title and year from a PDF's info dictionary.

        Args:
            pdf_file_stream: Binary file object containing the PDF.

        Returns:
            Dict with the keys 'author', 'title' and 'year'. A field that is
            absent from the PDF (or empty) keeps its 'UNKNOWN_*' placeholder.
        """
        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        pdf_parser = PDFParser(pdf_file_stream)
        pdf_doc = PDFDocument(pdf_parser)

        # Many PDFs carry no info dictionary, or only some of its fields;
        # fall back to the placeholders instead of raising.
        info = pdf_doc.info[0] if pdf_doc.info else {}

        if 'Author' in info:
            author = make_pdf_metadata_str(info['Author'])
            if author:
                metadata['author'] = author

        if 'Title' in info:
            title = make_pdf_metadata_str(info['Title'])
            if title:
                metadata['title'] = title

        if 'ModDate' in info:
            year = pdf_metadata_moddate_to_year(
                make_pdf_metadata_str(info['ModDate']))
            if year:
                metadata['year'] = year

        return metadata

    def guess_query_type(self, query):
        """Classify a query and log the result.

        Args:
            query: Query string.

        Returns:
            'pdf' (http(s) URL ending in 'pdf'), 'url' (other http(s) URL),
            'pmid' (digits only), 'doi' (starts with 'doi:' or matches the
            DOI pattern) or 'unknown'.
        """
        # 'https' URLs also start with 'http', so one prefix test suffices
        if query.startswith('http'):
            query_type = 'pdf' if query.endswith('pdf') else 'url'
        elif query.isdigit():
            query_type = 'pmid'
        elif query.startswith('doi:') or re.match(self._doi_pattern, query):
            query_type = 'doi'
        else:
            query_type = 'unknown'

        log_formater = self.tr('Query type: ') + '{query_type}'
        self.log(log_formater.format(query_type=query_type.upper()), 'INFO')

        return query_type

    def fetch(self, query):
        """Download the PDF for a query, trying each Sci-Hub URL in turn.

        The configured Sci-Hub URL is tried first, followed by the other
        available URLs in list order (wrapping around). The URL in use is
        written back to the configuration on every attempt.

        Args:
            query: Query string.

        Returns:
            Dict. On success it holds 'filename', 'pdf' (raw bytes) and the
            metadata keys from get_pdf_metadata(); on failure it holds
            'error' with a message.
        """
        query_type = self.guess_query_type(query)

        if query_type == 'unknown':
            return {'error': self.tr('Unknown query type')}

        current_scihub_url = self._conf.get('network', 'scihub_url')
        scihub_available_urls = json.loads(
            self._conf.get('network', 'scihub_available_urls'))

        try:
            start = scihub_available_urls.index(current_scihub_url)
        except ValueError:
            # Configured URL is not in the list: just use the list order
            start = 0

        candidate_urls = (scihub_available_urls[start:] +
                          scihub_available_urls[:start])

        if not candidate_urls:
            data = {'error': self.tr('Failed with all Sci-Hub URLs!')}
            self.log(data['error'], 'ERROR')
            return data

        last_attempt = len(candidate_urls) - 1
        data = {}

        for attempt, scihub_url in enumerate(candidate_urls):
            data = {}

            log_formater = self.tr('Using Sci-Hub URL: ') + '{scihub_url}'
            self.log(log_formater.format(scihub_url=scihub_url), 'INFO')
            self._conf.set('network', 'scihub_url', scihub_url)

            # A direct PDF link is downloaded as is; anything else is first
            # resolved through the Sci-Hub page.
            pdf_url = query

            if query_type != 'pdf':
                pdf_query_url = 'http://{scihub_url}/{query}'.format(
                    scihub_url=scihub_url, query=query)

                try:
                    self.log(self.tr('Fetching PDF URL ...'), 'INFO')

                    # NOTE: verify=False disables TLS certificate checks
                    pdf_url_response = self._sess.get(
                        pdf_query_url, verify=False, timeout=self._timeout())

                    html = etree.HTML(pdf_url_response.content)
                    iframes = html.xpath('//iframe')

                    if len(iframes) > 0:
                        # The first iframe's src is taken as the PDF URL
                        pdf_url = iframes[0].attrib['src']

                        log_formater = self.tr('Got PDF URL: ') + '{pdf_url}'
                        self.log(log_formater.format(pdf_url=pdf_url), 'INFO')
                    else:
                        data['error'] = self.tr('No valide iframe!')
                        self.log(self.tr('Failed to get PDF URL!'), 'ERROR')
                        self.log(data['error'], 'ERROR')
                except Exception as err:
                    data['error'] = str(err)
                    self.log(self.tr('Failed to get PDF!'), 'ERROR')
                    self.log(data['error'], 'ERROR')

            if 'error' not in data:
                filename = urlparse(pdf_url).path[1:].split('/')[-1]
                data['filename'] = re.sub(
                    self._illegal_filename_pattern, '_', filename)

                self.log(self.tr('Fetching PDF ...'), 'INFO')

                try:
                    pdf_response = self._sess.get(
                        pdf_url, verify=False, timeout=self._timeout())

                    content_type = pdf_response.headers.get('Content-Type')
                    if content_type == 'application/pdf':
                        data['pdf'] = pdf_response.content

                        # The context manager closes the temporary file even
                        # when metadata parsing raises.
                        with tempfile.TemporaryFile() as temp_pdf_file:
                            temp_pdf_file.write(data['pdf'])
                            pdf_metadata = self.get_pdf_metadata(
                                temp_pdf_file)

                        data = dict(data, **pdf_metadata)
                    else:
                        data['error'] = self.tr('Unknown Content-Type')
                        self.log(self.tr('Failed to get PDF!'), 'ERROR')
                        self.log(data['error'], 'ERROR')
                except Exception as err:
                    data['error'] = str(err)
                    self.log(self.tr('Failed to get PDF!'), 'ERROR')
                    self.log(data['error'], 'ERROR')

            if 'error' not in data:
                break

            if attempt == last_attempt:
                self.log(self.tr('Failed with all Sci-Hub URLs!'), 'ERROR')
            else:
                self.log(self.tr('Changing Sci-Hub URL ...'), 'INFO')

        return data

    def rampage(self, query):
        """Fetch the PDF for a query and write it to the save directory.

        The file name is the configured 'filename_prefix_format' followed by
        '_{filename}', formatted with the dict returned by fetch(). Nothing
        is written when fetch() reports an error.

        Args:
            query: Query string.
        """
        self.log('\n')
        log_formater = self.tr('Dealing with query: ') + '{query}'
        self.log(log_formater.format(query=query), 'INFO')

        data = self.fetch(query)

        if 'error' not in data:
            pdf_name_formater = self._conf.get(
                'common', 'filename_prefix_format') + '_{filename}'
            pdf_name = pdf_name_formater.format(**data)
            pdf_path = os.path.join(
                self._conf.get('common', 'save_to_dir'), pdf_name)

            with open(pdf_path, 'wb') as fp:
                fp.write(data['pdf'])

            log_formater = self.tr('Saved PDF as: ') + '{pdf_name}'
            self.log(log_formater.format(pdf_name=pdf_name), 'INFO')

    def run(self):
        """Thread body: process the query, then invoke the callback.

        The callback is invoked even when rampage() raises, so the caller is
        always told that the work has ended; the exception still propagates.
        """
        try:
            self.rampage(self._query)
        finally:
            if self._callback:
                self._callback()
class SciHubEVA(QObject):
    """Glue object between the QML main window and the SciHubAPI worker."""

    # Signals emitted towards the QML window (connected in _connect)
    beforeRampage = pyqtSignal()
    afterRampage = pyqtSignal()
    showErrorMessage = pyqtSignal(str, str)
    showInfoMessage = pyqtSignal(str, str)
    setSaveToDir = pyqtSignal(str)
    appendLogs = pyqtSignal(str)

    def __init__(self):
        """Load the QML UI, wire up the signals and restore the save directory."""
        super(SciHubEVA, self).__init__()

        self._conf = SciHubConf()

        self._engine = engine = QQmlApplicationEngine()
        engine.load('qrc:/ui/SciHubEVA.qml')
        self._window = engine.rootObjects()[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(self._conf)

        # A missing or blank directory in the config means "not chosen yet"
        configured_dir = self._conf.get('common', 'save_to_dir')
        if configured_dir and configured_dir.strip() != '':
            self._save_to_dir = configured_dir
            self.setSaveToDir.emit(configured_dir)
        else:
            self._save_to_dir = None

    def _connect(self):
        """Connect window signals to our slots and our signals to the window."""
        window = self._window

        # Window (QML) -> Python
        incoming = (
            (window.saveToDir, self.saveToDir),
            (window.rampage, self.rampage),
            (window.showWindowPreference, self.showWindowPreferences),
        )
        for qml_signal, py_slot in incoming:
            qml_signal.connect(py_slot)

        # Python -> window (QML)
        outgoing = (
            (self.beforeRampage, window.beforeRampage),
            (self.afterRampage, window.afterRampage),
            (self.showErrorMessage, window.showErrorMessage),
            (self.showInfoMessage, window.showInfoMessage),
            (self.setSaveToDir, window.setSaveToDir),
            (self.appendLogs, window.appendLogs),
        )
        for py_signal, qml_slot in outgoing:
            py_signal.connect(qml_slot)

    @property
    def conf(self):
        """The configuration object created in __init__."""
        return self._conf

    @pyqtSlot(str)
    def saveToDir(self, directory):
        """Remember the chosen directory and store it in the configuration."""
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @pyqtSlot()
    def showWindowPreferences(self):
        """Reload the preferences from the configuration and show them."""
        preferences = self._scihub_preferences
        preferences.loadFromConf()
        preferences.showWindowPreferences.emit()

    @pyqtSlot(str)
    def rampage(self, query):
        """Start a SciHubAPI worker thread for the given query."""
        worker = SciHubAPI(query,
                           callback=self._afterRampage,
                           conf=self._conf,
                           log=self._log)
        self._beforeRampage()
        worker.start()

    def _beforeRampage(self):
        """Emit beforeRampage."""
        self.beforeRampage.emit()

    def _afterRampage(self):
        """Emit afterRampage; passed to the worker as its callback."""
        self.afterRampage.emit()

    def _log(self, message, type = None):
        """Emit the message through appendLogs, prefixed with its type if any."""
        template = '[{type}] - {message}' if type else '{message}'
        self.appendLogs.emit(template.format(type=type, message=message))
class SciHubEVA(QObject):
    """Glue object between the QML main window, the captcha window and
    the SciHubAPI worker."""

    # Signals emitted towards the QML window (connected in _connect)
    beforeRampage = pyqtSignal()
    afterRampage = pyqtSignal()
    showErrorMessage = pyqtSignal(str, str)
    showInfoMessage = pyqtSignal(str, str)
    setSaveToDir = pyqtSignal(str)
    appendLogs = pyqtSignal(str)

    def __init__(self):
        """Load the QML UI, create the helper windows and restore the
        save directory."""
        super(SciHubEVA, self).__init__()

        self._conf = SciHubConf()

        self._engine = QQmlApplicationEngine()
        self._engine.load('qrc:/ui/SciHubEVA.qml')
        root_objects = self._engine.rootObjects()
        self._window = root_objects[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(self._conf)
        self._scihub_captcha = SciHubCaptcha(self, log=self.log)
        # Response kept while waiting for the user's captcha answer
        self._captcha_query = None

        stored_dir = self._conf.get('common', 'save_to_dir')
        if stored_dir and stored_dir.strip() != '':
            self._save_to_dir = stored_dir
            self.setSaveToDir.emit(stored_dir)
        else:
            self._save_to_dir = None

    def _connect(self):
        """Connect window signals to our slots and our signals to the window."""
        window = self._window

        # Window (QML) -> Python
        window.saveToDir.connect(self.saveToDir)
        window.rampage.connect(self.rampage)
        window.showWindowPreference.connect(self.showWindowPreferences)

        # Python -> window (QML)
        self.beforeRampage.connect(window.beforeRampage)
        self.afterRampage.connect(window.afterRampage)
        self.showErrorMessage.connect(window.showErrorMessage)
        self.showInfoMessage.connect(window.showInfoMessage)
        self.setSaveToDir.connect(window.setSaveToDir)
        self.appendLogs.connect(window.appendLogs)

    @property
    def conf(self):
        """The configuration object created in __init__."""
        return self._conf

    @pyqtSlot(str)
    def saveToDir(self, directory):
        """Remember the chosen directory and store it in the configuration."""
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @pyqtSlot()
    def showWindowPreferences(self):
        """Reload the preferences from the configuration and show them."""
        preferences = self._scihub_preferences
        preferences.loadFromConf()
        preferences.showWindowPreferences.emit()

    @pyqtSlot(str)
    def rampage(self, input_query):
        """Start a worker thread for a query typed by the user.

        Args:
            input_query: Query of input
        """
        worker = SciHubAPI(input_query,
                           callback=self.rampage_callback,
                           rampage_type=SciHubRampageType.INPUT,
                           conf=self._conf,
                           log=self.log)
        self.beforeRampage.emit()
        worker.start()

    def rampageWithCaptchar(self, captcha_answer):
        """Start a worker thread that retries the stored captcha query
        (self._captcha_query) with the user's answer.

        Args:
            captcha_answer: Captcha answer
        """
        worker = SciHubAPI(
            self._captcha_query,
            callback=self.rampage_callback,
            rampage_type=SciHubRampageType.PDF_CAPTCHA_RESPONSE,
            conf=self._conf,
            log=self.log,
            captcha_answer=captcha_answer)
        self.beforeRampage.emit()
        worker.start()

    def rampage_callback(self, res, err):
        """Handle the outcome reported by a worker.

        Args:
            res: Result from last round rampage
            err: Error
        """
        if err == SciHubError.BLOCKED_BY_CAPTCHA:
            # The result is the captcha response; ask the user to solve it
            self.captcha_callback(res)
            return

        self.afterRampage.emit()

    def captcha_callback(self, pdf_captcha_response):
        """Store the captcha response and show its image in the captcha
        window.

        Args:
            pdf_captcha_response: PDF captcha response
        """
        self._captcha_query = pdf_captcha_response
        captcha_info = SciHubAPI.get_captcha_info(pdf_captcha_response)
        _, captcha_img_url = captcha_info
        self._scihub_captcha.showWindowCaptcha.emit(captcha_img_url)

    def log(self, message, level=None):
        """Emit the message through appendLogs, prefixed with its level
        if any."""
        template = '[{level}] - {message}' if level else '{message}'
        self.appendLogs.emit(template.format(level=level, message=message))
class SciHubEVA(QObject):
    """Glue object between the QML main window, the captcha window and
    the SciHubAPI worker.

    Accepts a single query, a range query, or the path of a text file
    with one query per line; queries from a list are processed one after
    another.
    """

    # Signals emitted towards the QML window (connected in _connect)
    beforeRampage = Signal()
    afterRampage = Signal()
    loadSaveToDir = Signal(str)
    appendLogs = Signal(str, str)

    def __init__(self):
        """Load the QML UI, create the helper windows and restore the
        save directory."""
        super(SciHubEVA, self).__init__()

        self._conf = SciHubConf('SciHubEVA.conf')
        self._qt_quick_controls2_conf = SciHubConf(
            'qtquickcontrols2.conf', space_around_delimiters=False)

        self._engine = QQmlApplicationEngine()
        # Version strings exposed to QML as context properties
        self._engine.rootContext().setContextProperty(
            'PYTHON_VERSION', '.'.join(str(v) for v in sys.version_info[:3]))
        self._engine.rootContext().setContextProperty(
            'QT_VERSION', PySide2.QtCore.qVersion())
        self._engine.load('qrc:/ui/SciHubEVA.qml')
        self._window = self._engine.rootObjects()[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(
            self._conf, self._qt_quick_controls2_conf)
        self._scihub_captcha = SciHubCaptcha(self, log=self.log)
        # Response kept while waiting for the user's captcha answer
        self._captcha_query = None

        save_to_dir = self._conf.get('common', 'save_to_dir')
        if not save_to_dir or save_to_dir.strip() == '':
            self._save_to_dir = None
        else:
            self._save_to_dir = save_to_dir
            self.loadSaveToDir.emit(save_to_dir)

        # Pending queries (deque) and the size of the list when it was read
        self._query_list = None
        self._query_list_length = 0
        # File object of the captcha image currently shown, if any
        self._captcha_img_file = None

    def _connect(self):
        """Connect window signals to our slots and our signals to the window."""
        # Connect QML signals to Python slots
        self._window.setSaveToDir.connect(self.setSaveToDir)
        self._window.showSaveToDir.connect(self.showSaveToDir)
        self._window.rampage.connect(self.rampage)
        self._window.showWindowPreference.connect(self.showWindowPreference)

        # Connect Python signals to QML slots
        self.beforeRampage.connect(self._window.beforeRampage)
        self.afterRampage.connect(self._window.afterRampage)
        self.loadSaveToDir.connect(self._window.loadSaveToDir)
        self.appendLogs.connect(self._window.appendLogs)

    @property
    def conf(self):
        """The main configuration object created in __init__."""
        return self._conf

    @Slot(str)
    def setSaveToDir(self, directory):
        """Remember the chosen directory and store it in the configuration."""
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @Slot(str)
    def showSaveToDir(self, directory):
        """Show the directory via show_directory() if the path exists."""
        if os.path.exists(directory):
            show_directory(directory)

    @Slot()
    def showWindowPreference(self):
        """Reload the preferences from the configuration and show them."""
        self._scihub_preferences.load_from_conf()
        self._scihub_preferences.showWindowPreferences.emit()

    @Slot(str)
    def rampage(self, input):
        """Download PDF with input

        An existing path is read as a query list file (one query per
        line, blank lines skipped); a range query is expanded into a
        query list; anything else is treated as a single query.

        Args:
            input: query or query list file path
        """
        if os.path.exists(input):
            if is_text_file(input):
                self._query_list = deque()

                with open(input, 'rt') as f:
                    for line in f:
                        cleaned_line = line.strip()
                        if cleaned_line != '':
                            self._query_list.append(cleaned_line)

                self._query_list_length = len(self._query_list)
                self.rampage_query_list()
            else:
                self.log('<hr/>')
                self.log(self.tr('Query list file is not a text file!'),
                         'ERROR')
        elif is_range_query(input):
            self._query_list = deque(gen_range_query_list(input))
            self._query_list_length = len(self._query_list)
            self.rampage_query_list()
        else:
            self.rampage_query(input)

    def rampage_query_list(self):
        """Download PDF with query list (self._query_list)

        Logs the progress and starts the next pending query; does
        nothing when no query is pending.
        """
        if self._query_list:
            self.log('<hr/>')
            self.log(
                self.tr('Dealing with {}/{} query ...').format(
                    self._query_list_length - len(self._query_list) + 1,
                    self._query_list_length))
            self.rampage_query(self._query_list.popleft())

    def rampage_query(self, query):
        """Download PDF with query

        Args:
            query: Query of input
        """
        scihub_api = SciHubAPI(query,
                               callback=self.rampage_callback,
                               rampage_type=SciHubRampageType.INPUT,
                               conf=self._conf,
                               log=self.log)
        self.beforeRampage.emit()
        scihub_api.start()

    def rampage_with_captcha(self, captcha_answer):
        """Download PDF with captcha query (self._captcha_query) and
        captcha answer

        Args:
            captcha_answer: Captcha answer
        """
        scihub_api = SciHubAPI(
            self._captcha_query,
            callback=self.rampage_callback,
            rampage_type=SciHubRampageType.PDF_CAPTCHA_RESPONSE,
            conf=self._conf,
            log=self.log,
            captcha_answer=captcha_answer)

        # The captcha has been answered, so its image file can be closed
        if self._captcha_img_file:
            self._captcha_img_file.close()

        self.beforeRampage.emit()
        scihub_api.start()

    def rampage_callback(self, res, err):
        """Callback function

        Shows the captcha when the worker was blocked by one, otherwise
        continues with the next pending query or emits afterRampage when
        none is left.

        Args:
            res: Result from last round rampage
            err: Error
        """
        if err == SciHubError.BLOCKED_BY_CAPTCHA:
            self.show_captcha(res)
        elif self._query_list:
            self.rampage_query_list()
        else:
            self.afterRampage.emit()

    def show_captcha(self, pdf_captcha_response):
        """Callback function for PDF captcha response

        Downloads the captcha image and shows it in the captcha window
        through a local file URI.

        Args:
            pdf_captcha_response: PDF captcha response
        """
        from pathlib import Path

        self._captcha_query = pdf_captcha_response

        scihub_api = SciHubAPI('', log=self.log, conf=self._conf)
        _, captcha_img_url = scihub_api.get_captcha_info(pdf_captcha_response)
        self._captcha_img_file = scihub_api.download_captcha_img(
            captcha_img_url)

        # Build the URI with pathlib: plain 'file://' + path concatenation
        # is not a valid file URI for Windows paths or for characters that
        # need percent-encoding.
        captcha_img_local_url = Path(
            self._captcha_img_file.name).absolute().as_uri()
        self._scihub_captcha.showWindowCaptcha.emit(captcha_img_local_url)

    def log(self, message, level=None):
        """Forward a message and its level to the window via appendLogs."""
        self.appendLogs.emit(message, level)
class SciHubEVA(QObject):
    """Glue object between the QML main window, the captcha window and
    the SciHubAPI worker, with log output mirrored to a log file."""

    # Signals emitted towards the QML window (connected in _connect)
    beforeRampage = Signal()
    afterRampage = Signal()
    loadSaveToDir = Signal(str)
    appendLogs = Signal(str, str)

    def __init__(self):
        """Load the QML UI, create the helper windows, restore the save
        directory and set up file logging."""
        super(SciHubEVA, self).__init__()

        self._conf = SciHubConf('SciHubEVA.conf')
        self._qt_quick_controls2_conf = SciHubConf(
            'qtquickcontrols2.conf', space_around_delimiters=False)

        self._engine = QQmlApplicationEngine()
        # Version strings exposed to QML as context properties
        python_version = '.'.join(str(v) for v in sys.version_info[:3])
        self._engine.rootContext().setContextProperty(
            'PYTHON_VERSION', python_version)
        self._engine.rootContext().setContextProperty(
            'QT_VERSION', PySide2.QtCore.qVersion())
        self._engine.load('qrc:/ui/SciHubEVA.qml')
        self._window = self._engine.rootObjects()[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(
            self._conf, self._qt_quick_controls2_conf)
        self._scihub_captcha = SciHubCaptcha(self, log=self.log)
        # Response kept while waiting for the user's captcha answer
        self._captcha_query = None
        # Raw text last submitted through rampage()
        self._input = None

        stored_dir = self._conf.get('common', 'save_to_dir')
        if stored_dir and stored_dir.strip() != '':
            self._save_to_dir = stored_dir
            self.loadSaveToDir.emit(stored_dir)
        else:
            self._save_to_dir = None

        # Pending queries (deque) and the size of the list when it was built
        self._query_list = None
        self._query_list_length = 0
        self._captcha_img_file_path = None

        # File logger writing to SciHubEVA.log in the log directory
        self._logger = logging.getLogger('SciHubEVA')
        self._logger.setLevel(logging.DEBUG)
        log_format = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        log_file = str(get_log_directory() / 'SciHubEVA.log')
        file_handler = TimedRotatingFileHandler(filename=log_file, when='D')
        file_handler.setFormatter(log_format)
        file_handler.setLevel(logging.DEBUG)
        self._logger.addHandler(file_handler)

        # Converter applied to messages in log() before they are written
        # to the log file
        self._h2t = html2text.HTML2Text()
        self._h2t.ignore_links = True

    def _connect(self):
        """Connect window signals to our slots and our signals to the window."""
        window = self._window

        # Window (QML) -> Python
        incoming = (
            (window.setSaveToDir, self.setSaveToDir),
            (window.openSaveToDir, self.openSaveToDir),
            (window.rampage, self.rampage),
            (window.showWindowPreference, self.showWindowPreference),
            (window.openLogFile, self.openLogFile),
            (window.openLogDirectory, self.openLogDirectory),
        )
        for qml_signal, py_slot in incoming:
            qml_signal.connect(py_slot)

        # Python -> window (QML)
        outgoing = (
            (self.beforeRampage, window.beforeRampage),
            (self.afterRampage, window.afterRampage),
            (self.loadSaveToDir, window.loadSaveToDir),
            (self.appendLogs, window.appendLogs),
        )
        for py_signal, qml_slot in outgoing:
            py_signal.connect(qml_slot)

    @property
    def conf(self):
        """The main configuration object created in __init__."""
        return self._conf

    @Slot(str)
    def setSaveToDir(self, directory):
        """Remember the chosen directory and store it in the configuration."""
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @Slot(str)
    def openSaveToDir(self, directory):
        """Open the directory via open_directory() if the path exists."""
        if not os.path.exists(directory):
            return
        open_directory(directory)

    @Slot()
    def showWindowPreference(self):
        """Reload the preferences from the configuration and show them."""
        preferences = self._scihub_preferences
        preferences.load_from_conf()
        preferences.showWindowPreferences.emit()

    @Slot()
    def openLogFile(self):
        """Open SciHubEVA.log from the log directory via open_file()."""
        log_file = get_log_directory() / 'SciHubEVA.log'
        open_file(str(log_file))

    @Slot()
    def openLogDirectory(self):
        """Open the log directory via open_directory()."""
        open_directory(str(get_log_directory()))

    @Slot(str)
    def rampage(self, input):
        """Dispatch the user's input to the matching download routine.

        An existing path is read as a query list file (one query per
        line, blank lines skipped); a range query is expanded into a
        query list; anything else is treated as a single query.

        Args:
            input: query or query list file path
        """
        self._input = input

        if os.path.exists(input):
            if not is_text_file(input):
                self.log('<hr/>')
                self.log(self.tr('Query list file is not a text file!'),
                         logging.ERROR)
                return

            self._query_list = deque()
            with open(input, 'rt') as f:
                for raw_line in f:
                    query = raw_line.strip()
                    if query != '':
                        self._query_list.append(query)

            self._query_list_length = len(self._query_list)
            self.rampage_query_list()
            return

        if is_range_query(input):
            self._query_list = deque(gen_range_query_list(input))
            self._query_list_length = len(self._query_list)
            self.rampage_query_list()
            return

        self.rampage_query(input)

    def rampage_query_list(self):
        """Log the progress and start the next pending query, if any."""
        pending = self._query_list
        if not (pending and len(pending) > 0):
            return

        self.log('<hr/>')
        total = self._query_list_length
        position = total - len(pending) + 1
        self.log(
            self.tr('Dealing with {}/{} query ...').format(position, total))
        self.rampage_query(pending.popleft())

    def rampage_query(self, query):
        """Start a worker thread for a single query.

        Args:
            query: Query of input
        """
        worker = SciHubAPI(self._input,
                           query,
                           callback=self.rampage_callback,
                           rampage_type=SciHubRampageType.INPUT,
                           conf=self._conf,
                           log=self.log)
        self.beforeRampage.emit()
        worker.start()

    def rampage_with_captcha(self, captcha_answer):
        """Retry the stored captcha query (self._captcha_query) with the
        user's answer, after deleting the local captcha image.

        Args:
            captcha_answer: Captcha answer
        """
        img_path = self._captcha_img_file_path
        if os.path.exists(img_path) and os.path.isfile(img_path):
            os.remove(img_path)

        worker = SciHubAPI(
            self._input,
            self._captcha_query,
            callback=self.rampage_callback,
            rampage_type=SciHubRampageType.PDF_CAPTCHA_RESPONSE,
            conf=self._conf,
            log=self.log,
            captcha_answer=captcha_answer)
        self.beforeRampage.emit()
        worker.start()

    def rampage_callback(self, res, err):
        """Handle the outcome reported by a worker.

        Shows the captcha when the worker was blocked by one, otherwise
        continues with the next pending query or emits afterRampage when
        none is left.

        Args:
            res: Result from last round rampage
            err: Error
        """
        if err == SciHubError.BLOCKED_BY_CAPTCHA:
            self.show_captcha(res)
            return

        if self._query_list:
            self.rampage_query_list()
            return

        self.afterRampage.emit()

    def show_captcha(self, pdf_captcha_response):
        """Download the captcha image and show it in the captcha window.

        Args:
            pdf_captcha_response: PDF captcha response
        """
        self._captcha_query = pdf_captcha_response

        api = SciHubAPI(self._input, None, log=self.log, conf=self._conf)
        _, captcha_img_url = api.get_captcha_info(pdf_captcha_response)
        captcha_img_file = api.download_captcha_img(captcha_img_url)

        # Keep the path for later deletion; hand the window a file URI
        img_path = Path(captcha_img_file.name)
        self._captcha_img_file_path = img_path.as_posix()
        self._scihub_captcha.showWindowCaptcha.emit(img_path.as_uri())

    def log(self, message: str, level=None):
        """Send a message to the window and mirror it to the log file.

        The window receives the message unchanged plus the level name
        ('' when no level is given). The log file receives the message
        after conversion with html2text, at the given level or INFO;
        messages that are empty after conversion are not written.

        Args:
            message: Message to log
            level: Optional logging level (e.g. logging.ERROR)
        """
        level_name = logging.getLevelName(level) if level else ''
        self.appendLogs.emit(message, level_name)

        plain_text = self._h2t.handle(message).strip()
        if not plain_text:
            return

        self._logger.log(level or logging.INFO, plain_text)