def __init__(self, query, callback=None, conf=None, log=None):
    """Store the query and prepare the HTTP session used by this worker.

    Args:
        query: Query this worker will process.
        callback: Optional callable kept for later use by the worker.
        conf: Configuration object; a fresh ``SciHubConf()`` is created
            when a falsy value is given.
        log: Optional logging callable; when truthy it is bound to
            ``self.log``.
    """
    QObject.__init__(self)
    threading.Thread.__init__(self)

    self._query = query
    self._callback = callback
    self._conf = conf if conf else SciHubConf()

    if log:
        self.log = log

    # The session headers are stored in the configuration as a JSON object.
    self._sess = session = requests.Session()
    session.headers = json.loads(
        self._conf.get('network', 'session_header'))

    # One retry budget is shared by total, read and connect retries, and
    # the same adapter serves both URL schemes.
    attempts = self._conf.getint('network', 'retry_times')
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(total=attempts, read=attempts, connect=attempts))
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)

    self._set_http_proxy()

    self._doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'
    self._illegal_filename_pattern = r'[\/\\\:\*\?\"\<\>\|]'
def __init__(self):
    """Load configuration and the QML UI, then wire up the application.

    The first root object of the loaded QML document is used as the main
    window. If a non-blank save directory is configured it is pushed to
    the UI through ``loadSaveToDir``.
    """
    super(SciHubEVA, self).__init__()

    self._conf = SciHubConf('SciHubEVA.conf')
    self._qt_quick_controls2_conf = SciHubConf(
        'qtquickcontrols2.conf', space_around_delimiters=False)

    # Expose interpreter / Qt versions to QML before the document loads.
    self._engine = QQmlApplicationEngine()
    python_version = '.'.join(str(part) for part in sys.version_info[:3])
    self._engine.rootContext().setContextProperty(
        'PYTHON_VERSION', python_version)
    self._engine.rootContext().setContextProperty(
        'QT_VERSION', PySide2.QtCore.qVersion())
    self._engine.load('qrc:/ui/SciHubEVA.qml')
    self._window = self._engine.rootObjects()[0]
    self._connect()

    self._scihub_preferences = SciHubPreferences(
        self._conf, self._qt_quick_controls2_conf)
    self._scihub_captcha = SciHubCaptcha(self, log=self.log)
    self._captcha_query = None

    # Only a non-blank configured directory is remembered and shown.
    configured_dir = self._conf.get('common', 'save_to_dir')
    if configured_dir and configured_dir.strip() != '':
        self._save_to_dir = configured_dir
        self.loadSaveToDir.emit(configured_dir)
    else:
        self._save_to_dir = None

    self._query_list = None
    self._query_list_length = 0
    self._captcha_img_file = None
def __init__(self, query, callback=None, rampage_type=None, conf=None, log=None, **kwargs):
    """Store the query and prepare the HTTP session used by this worker.

    Args:
        query: Query this worker will process.
        callback: Optional callable kept for later use by the worker.
        rampage_type: Kind of processing requested for ``query``.
        conf: Configuration object; a fresh ``SciHubConf()`` is created
            when a falsy value is given.
        log: Optional logging callable; when truthy it is bound to
            ``self.log``.
        **kwargs: Optional extras. ``captcha_answer`` is the only key
            read here.
    """
    QObject.__init__(self)
    threading.Thread.__init__(self)

    self._query = query
    self._callback = callback
    self._rampage_type = rampage_type

    # Captcha answer, used only when
    # rampage_type == SciHubRampageType.PDF_CAPTCHA_RESPONSE.
    # The attribute is always defined (None when not supplied) so later
    # reads cannot fail with AttributeError.
    self._captcha_answer = kwargs.get('captcha_answer')

    if conf:
        self._conf = conf
    else:
        self._conf = SciHubConf()

    if log:
        self.log = log

    # The session headers are stored in the configuration as a JSON object.
    self._sess = requests.Session()
    self._sess.headers = json.loads(self._conf.get('network', 'session_header'))

    # One retry budget is shared by total, read and connect retries.
    retry_times = self._conf.getint('network', 'retry_times')
    retry = Retry(total=retry_times, read=retry_times, connect=retry_times)
    adapter = HTTPAdapter(max_retries=retry)
    self._sess.mount('http://', adapter)
    self._sess.mount('https://', adapter)

    self._set_http_proxy()

    self._doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'
    self._illegal_filename_pattern = r'[\/\\\:\*\?\"\<\>\|]'
def __init__(self):
    """Load configuration and the QML UI, then set up state and logging.

    The first root object of the loaded QML document is used as the main
    window. If a non-blank save directory is configured it is pushed to
    the UI through ``loadSaveToDir``. A daily-rotating file handler is
    attached to the ``'SciHubEVA'`` logger.
    """
    super(SciHubEVA, self).__init__()

    self._conf = SciHubConf('SciHubEVA.conf')
    self._qt_quick_controls2_conf = SciHubConf(
        'qtquickcontrols2.conf', space_around_delimiters=False)

    # Expose interpreter / Qt versions to QML before the document loads.
    self._engine = QQmlApplicationEngine()
    python_version = '.'.join(str(part) for part in sys.version_info[:3])
    self._engine.rootContext().setContextProperty(
        'PYTHON_VERSION', python_version)
    self._engine.rootContext().setContextProperty(
        'QT_VERSION', PySide2.QtCore.qVersion())
    self._engine.load('qrc:/ui/SciHubEVA.qml')
    self._window = self._engine.rootObjects()[0]
    self._connect()

    self._scihub_preferences = SciHubPreferences(
        self._conf, self._qt_quick_controls2_conf)
    self._scihub_captcha = SciHubCaptcha(self, log=self.log)
    self._captcha_query = None
    self._input = None

    # Only a non-blank configured directory is remembered and shown.
    configured_dir = self._conf.get('common', 'save_to_dir')
    if configured_dir and configured_dir.strip() != '':
        self._save_to_dir = configured_dir
        self.loadSaveToDir.emit(configured_dir)
    else:
        self._save_to_dir = None

    self._query_list = None
    self._query_list_length = 0
    self._captcha_img_file_path = None

    # File logging: timestamped records at DEBUG level and above.
    self._logger = logging.getLogger('SciHubEVA')
    self._logger.setLevel(logging.DEBUG)
    record_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(message)s')
    log_path = str(get_log_directory() / 'SciHubEVA.log')
    file_handler = TimedRotatingFileHandler(filename=log_path, when='D')
    file_handler.setFormatter(record_format)
    file_handler.setLevel(logging.DEBUG)
    self._logger.addHandler(file_handler)

    # HTML-to-text converter configured to ignore links.
    self._h2t = html2text.HTML2Text()
    self._h2t.ignore_links = True
def __init__(self):
    """Load configuration and the QML UI, then connect signals.

    The first root object of the loaded QML document is used as the main
    window. If a non-blank save directory is configured it is pushed to
    the UI through ``setSaveToDir``.
    """
    super(SciHubEVA, self).__init__()

    self._conf = SciHubConf()

    self._engine = QQmlApplicationEngine()
    self._engine.load('qrc:/ui/SciHubEVA.qml')
    root_objects = self._engine.rootObjects()
    self._window = root_objects[0]
    self._connect()

    self._scihub_preferences = SciHubPreferences(self._conf)

    # Only a non-blank configured directory is remembered and shown.
    configured_dir = self._conf.get('common', 'save_to_dir')
    if configured_dir and configured_dir.strip() != '':
        self._save_to_dir = configured_dir
        self.setSaveToDir.emit(configured_dir)
    else:
        self._save_to_dir = None
def __init__(self):
    """Load configuration and the QML UI, then connect signals.

    The first root object of the loaded QML document is used as the main
    window. If a non-blank save directory is configured it is pushed to
    the UI through ``setSaveToDir``.
    """
    super(SciHubEVA, self).__init__()

    self._conf = SciHubConf('SciHubEVA.conf')
    self._qt_quick_controls2_conf = SciHubConf(
        'qtquickcontrols2.conf', space_around_delimiters=False)

    self._engine = QQmlApplicationEngine()
    self._engine.load('qrc:/ui/SciHubEVA.qml')
    root_objects = self._engine.rootObjects()
    self._window = root_objects[0]
    self._connect()

    self._scihub_preferences = SciHubPreferences(
        self._conf, self._qt_quick_controls2_conf)
    self._scihub_captcha = SciHubCaptcha(self, log=self.log)
    self._captcha_query = None

    # Only a non-blank configured directory is remembered and shown.
    configured_dir = self._conf.get('common', 'save_to_dir')
    if configured_dir and configured_dir.strip() != '':
        self._save_to_dir = configured_dir
        self.setSaveToDir.emit(configured_dir)
    else:
        self._save_to_dir = None
class SciHubAPI(QObject, threading.Thread):
    """Worker thread that resolves a query through Sci-Hub and saves the PDF.

    ``run`` processes the query given at construction time and then invokes
    the optional callback (with no arguments).
    """

    def __init__(self, query, callback=None, conf=None, log=None):
        """Store the query and prepare the HTTP session.

        Args:
            query: Query this worker will process.
            callback: Optional zero-argument callable invoked when ``run``
                finishes.
            conf: Configuration object; a fresh ``SciHubConf()`` is created
                when a falsy value is given.
            log: Optional logging callable taking ``(message, type)``; when
                truthy it replaces the default ``log`` method.
        """
        QObject.__init__(self)
        threading.Thread.__init__(self)

        self._query = query
        self._callback = callback

        if conf:
            self._conf = conf
        else:
            self._conf = SciHubConf()

        if log:
            self.log = log

        # The session headers are stored in the configuration as JSON.
        self._sess = requests.Session()
        self._sess.headers = json.loads(
            self._conf.get('network', 'session_header'))

        # One retry budget is shared by total, read and connect retries.
        retry_times = self._conf.getint('network', 'retry_times')
        retry = Retry(total=retry_times, read=retry_times, connect=retry_times)
        adapter = HTTPAdapter(max_retries=retry)
        self._sess.mount('http://', adapter)
        self._sess.mount('https://', adapter)

        self._set_http_proxy()

        self._doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'
        self._illegal_filename_pattern = r'[\/\\\:\*\?\"\<\>\|]'

    def log(self, message, type=None):
        """Print ``message``, prefixed with ``[type] - `` when a type is given."""
        if type:
            log_formater = '[{type}] - {message}'
        else:
            log_formater = '{message}'

        print(log_formater.format(type=type, message=message))

    def _set_http_proxy(self):
        """Configure the session proxies from the ``proxy`` config section.

        Builds ``type://[username[:password]@]host[:port]`` and uses it for
        both HTTP and HTTPS. Nothing happens when the proxy is disabled.
        """
        if not self._conf.getboolean('proxy', 'enabled'):
            return

        proxy_type = self._conf.get('proxy', 'type')
        proxy_host = self._conf.get('proxy', 'host')
        proxy_port = self._conf.get('proxy', 'port')
        proxy_username = self._conf.get('proxy', 'username')
        proxy_password = self._conf.get('proxy', 'password')

        proxy = proxy_type + '://'

        # Credentials are only meaningful with a username; the password
        # must be separated from it by ':' for the URL to be valid.
        if proxy_username:
            proxy += proxy_username
            if proxy_password:
                proxy += ':' + proxy_password
            proxy += '@'

        proxy += proxy_host

        if proxy_port:
            proxy += ':' + proxy_port

        self._sess.proxies = {'http': proxy, 'https': proxy}

    def get_pdf_metadata(self, pdf_file_stream):
        """Extract author, title and year from a PDF file stream.

        Args:
            pdf_file_stream: File object containing the PDF.

        Returns:
            Dict with keys ``author``, ``title`` and ``year``. Any entry
            that is missing or empty in the PDF keeps its ``UNKNOWN_*``
            placeholder.
        """
        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        pdf_parser = PDFParser(pdf_file_stream)
        pdf_doc = PDFDocument(pdf_parser)

        # Many PDFs carry no (or partial) document info; treat absent
        # entries as unknown instead of failing the whole download.
        pdf_info = pdf_doc.info[0] if pdf_doc.info else {}

        if 'Author' in pdf_info:
            author = make_pdf_metadata_str(pdf_info['Author'])
            if author:
                metadata['author'] = author

        if 'Title' in pdf_info:
            title = make_pdf_metadata_str(pdf_info['Title'])
            if title:
                metadata['title'] = title

        if 'ModDate' in pdf_info:
            year = pdf_metadata_moddate_to_year(
                make_pdf_metadata_str(pdf_info['ModDate']))
            if year:
                metadata['year'] = year

        return metadata

    def guess_query_type(self, query):
        """Classify ``query`` as ``pdf``, ``url``, ``pmid``, ``doi`` or ``unknown``.

        The detected type is also logged (upper-cased) at INFO level.
        """
        # 'https...' also starts with 'http', so one prefix test suffices.
        if query.startswith('http'):
            if query.endswith('pdf'):
                query_type = 'pdf'
            else:
                query_type = 'url'
        elif query.isdigit():
            query_type = 'pmid'
        elif query.startswith('doi:') or re.match(self._doi_pattern, query):
            query_type = 'doi'
        else:
            query_type = 'unknown'

        log_formater = self.tr('Query type: ') + '{query_type}'
        self.log(log_formater.format(query_type=query_type.upper()), 'INFO')

        return query_type

    def fetch(self, query):
        """Download the PDF for ``query``, trying each Sci-Hub URL in turn.

        The configured Sci-Hub URL is tried first, followed by the other
        available ones. The URL being tried is written back to the
        configuration.

        Args:
            query: User query (URL, PMID, DOI, ...).

        Returns:
            Dict with ``pdf`` (bytes), ``filename``, ``author``, ``title``
            and ``year`` on success, or with an ``error`` message on
            failure.
        """
        query_type = self.guess_query_type(query)
        data = {}

        if query_type == 'unknown':
            data['error'] = self.tr('Unknown query type')
            return data

        current_scihub_url = self._conf.get('network', 'scihub_url')
        scihub_available_urls = json.loads(
            self._conf.get('network', 'scihub_available_urls'))

        # Rotate the list so that the configured URL comes first. If the
        # configured URL is not part of the list, try it first anyway
        # instead of failing with ValueError.
        try:
            start = scihub_available_urls.index(current_scihub_url)
        except ValueError:
            candidate_urls = (
                [current_scihub_url] if current_scihub_url else [])
            candidate_urls.extend(scihub_available_urls)
        else:
            candidate_urls = scihub_available_urls[start:]
            candidate_urls.extend(scihub_available_urls[:start])

        if not candidate_urls:
            data['error'] = self.tr('No available Sci-Hub URL')
            self.log(data['error'], 'ERROR')
            return data

        last_attempt = len(candidate_urls) - 1

        for attempt, scihub_url in enumerate(candidate_urls):
            data = {}

            log_formater = self.tr('Using Sci-Hub URL: ') + '{scihub_url}'
            self.log(log_formater.format(scihub_url=scihub_url), 'INFO')
            self._conf.set('network', 'scihub_url', scihub_url)

            # A direct PDF link needs no lookup on Sci-Hub.
            pdf_url = query

            if query_type != 'pdf':
                pdf_query_url = 'http://{scihub_url}/{query}'.format(
                    scihub_url=scihub_url, query=query)

                try:
                    self.log(self.tr('Fetching PDF URL ...'), 'INFO')

                    # The configured timeout is divided by 1000 before
                    # being handed to requests.
                    pdf_url_response = self._sess.get(
                        pdf_query_url, verify=False,
                        timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

                    html = etree.HTML(pdf_url_response.content)
                    iframes = html.xpath('//iframe')

                    if len(iframes) > 0:
                        iframe = iframes[0]
                        pdf_url = iframe.attrib['src']

                        log_formater = self.tr('Got PDF URL: ') + '{pdf_url}'
                        self.log(log_formater.format(pdf_url=pdf_url), 'INFO')
                    else:
                        data['error'] = self.tr('No valide iframe!')
                        self.log(self.tr('Failed to get PDF URL!'), 'ERROR')
                        self.log(data['error'], 'ERROR')
                except Exception as err:
                    data['error'] = str(err)
                    self.log(self.tr('Failed to get PDF!'), 'ERROR')
                    self.log(data['error'], 'ERROR')

            if 'error' not in data:
                filename = urlparse(pdf_url).path[1:].split('/')[-1]
                data['filename'] = re.sub(self._illegal_filename_pattern,
                                          '_', filename)

                self.log(self.tr('Fetching PDF ...'), 'INFO')

                try:
                    pdf_response = self._sess.get(
                        pdf_url, verify=False,
                        timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

                    # .get() so that a missing header is reported as an
                    # unknown content type rather than a bare KeyError.
                    if pdf_response.headers.get(
                            'Content-Type') == 'application/pdf':
                        data['pdf'] = pdf_response.content

                        # The context manager closes the temp file even
                        # when metadata extraction raises.
                        with tempfile.TemporaryFile() as temp_pdf_file:
                            temp_pdf_file.write(data['pdf'])
                            pdf_metadata = self.get_pdf_metadata(
                                temp_pdf_file)

                        data = dict(data, **pdf_metadata)
                    else:
                        data['error'] = self.tr('Unknown Content-Type')
                        self.log(self.tr('Failed to get PDF!'), 'ERROR')
                        self.log(data['error'], 'ERROR')
                except Exception as err:
                    data['error'] = str(err)
                    self.log(self.tr('Failed to get PDF!'), 'ERROR')
                    self.log(data['error'], 'ERROR')

            if 'error' not in data:
                break

            if attempt == last_attempt:
                self.log(self.tr('Failed with all Sci-Hub URLs!'), 'ERROR')
            else:
                self.log(self.tr('Changing Sci-Hub URL ...'), 'INFO')

        return data

    def rampage(self, query):
        """Fetch the PDF for ``query`` and save it to the configured directory.

        The file name is the configured ``filename_prefix_format`` followed
        by ``_`` and the remote file name, formatted with the fetched data.
        Nothing is written when fetching failed.
        """
        self.log('\n')
        log_formater = self.tr('Dealing with query: ') + '{query}'
        self.log(log_formater.format(query=query), 'INFO')

        data = self.fetch(query)

        if 'error' not in data:
            pdf_name_formater = self._conf.get(
                'common', 'filename_prefix_format') + '_{filename}'
            pdf_name = pdf_name_formater.format(**data)
            pdf_path = os.path.join(self._conf.get('common', 'save_to_dir'),
                                    pdf_name)

            with open(pdf_path, 'wb') as fp:
                fp.write(data['pdf'])

            log_formater = self.tr('Saved PDF as: ') + '{pdf_name}'
            self.log(log_formater.format(pdf_name=pdf_name), 'INFO')

    def run(self):
        """Thread entry point: process the query, then invoke the callback.

        The callback runs even if processing raises, so that whoever
        started the worker is always notified; the exception still
        propagates.
        """
        try:
            self.rampage(self._query)
        finally:
            if self._callback:
                self._callback()
class SciHubEVA(QObject):
    """Application controller bridging the QML window and ``SciHubAPI``."""

    beforeRampage = pyqtSignal()
    afterRampage = pyqtSignal()

    showErrorMessage = pyqtSignal(str, str)
    showInfoMessage = pyqtSignal(str, str)

    setSaveToDir = pyqtSignal(str)
    appendLogs = pyqtSignal(str)

    def __init__(self):
        """Load configuration and the QML UI, then connect signals.

        The first root object of the loaded QML document is used as the
        main window. If a non-blank save directory is configured it is
        pushed to the UI through ``setSaveToDir``.
        """
        super(SciHubEVA, self).__init__()

        self._conf = SciHubConf()

        self._engine = QQmlApplicationEngine()
        self._engine.load('qrc:/ui/SciHubEVA.qml')
        root_objects = self._engine.rootObjects()
        self._window = root_objects[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(self._conf)

        # Only a non-blank configured directory is remembered and shown.
        configured_dir = self._conf.get('common', 'save_to_dir')
        if configured_dir and configured_dir.strip() != '':
            self._save_to_dir = configured_dir
            self.setSaveToDir.emit(configured_dir)
        else:
            self._save_to_dir = None

    def _connect(self):
        """Wire QML signals to Python slots and Python signals to QML slots."""
        window = self._window

        # Connect QML signals to PyQt slots
        window.saveToDir.connect(self.saveToDir)
        window.rampage.connect(self.rampage)
        window.showWindowPreference.connect(self.showWindowPreferences)

        # Connect PyQt signals to QML slots
        self.beforeRampage.connect(window.beforeRampage)
        self.afterRampage.connect(window.afterRampage)

        self.showErrorMessage.connect(window.showErrorMessage)
        self.showInfoMessage.connect(window.showInfoMessage)

        self.setSaveToDir.connect(window.setSaveToDir)
        self.appendLogs.connect(window.appendLogs)

    @property
    def conf(self):
        """Configuration object used by the application."""
        return self._conf

    @pyqtSlot(str)
    def saveToDir(self, directory):
        """Remember ``directory`` and store it in the configuration."""
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @pyqtSlot()
    def showWindowPreferences(self):
        """Reload preferences from the configuration and show the dialog."""
        preferences = self._scihub_preferences
        preferences.loadFromConf()
        preferences.showWindowPreferences.emit()

    @pyqtSlot(str)
    def rampage(self, query):
        """Start a ``SciHubAPI`` worker thread for ``query``."""
        worker = SciHubAPI(query, callback=self._afterRampage,
                           conf=self._conf, log=self._log)
        self._beforeRampage()
        worker.start()

    def _beforeRampage(self):
        """Emit ``beforeRampage``."""
        self.beforeRampage.emit()

    def _afterRampage(self):
        """Emit ``afterRampage``."""
        self.afterRampage.emit()

    def _log(self, message, type = None):
        """Send ``message`` to the UI log, prefixed with its type if given."""
        template = '[{type}] - {message}' if type else '{message}'
        self.appendLogs.emit(template.format(type=type, message=message))
class SciHubEVA(QObject):
    """Application controller bridging the QML window and ``SciHubAPI``.

    Supports single queries, query-list files, range queries and the
    captcha round trip.
    """

    beforeRampage = Signal()
    afterRampage = Signal()

    loadSaveToDir = Signal(str)
    appendLogs = Signal(str, str)

    def __init__(self):
        """Load configuration and the QML UI, then wire up the application.

        The first root object of the loaded QML document is used as the
        main window. If a non-blank save directory is configured it is
        pushed to the UI through ``loadSaveToDir``.
        """
        super(SciHubEVA, self).__init__()

        self._conf = SciHubConf('SciHubEVA.conf')
        self._qt_quick_controls2_conf = SciHubConf(
            'qtquickcontrols2.conf', space_around_delimiters=False)

        # Expose interpreter / Qt versions to QML before loading it.
        self._engine = QQmlApplicationEngine()
        self._engine.rootContext().setContextProperty(
            'PYTHON_VERSION', '.'.join(str(v) for v in sys.version_info[:3]))
        self._engine.rootContext().setContextProperty(
            'QT_VERSION', PySide2.QtCore.qVersion())
        self._engine.load('qrc:/ui/SciHubEVA.qml')
        self._window = self._engine.rootObjects()[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(
            self._conf, self._qt_quick_controls2_conf)
        self._scihub_captcha = SciHubCaptcha(self, log=self.log)
        self._captcha_query = None

        save_to_dir = self._conf.get('common', 'save_to_dir')
        if not save_to_dir or save_to_dir.strip() == '':
            self._save_to_dir = None
        else:
            self._save_to_dir = save_to_dir
            self.loadSaveToDir.emit(save_to_dir)

        self._query_list = None
        self._query_list_length = 0
        self._captcha_img_file = None

    def _connect(self):
        """Wire QML signals to Python slots and Python signals to QML slots."""
        # Connect QML signals to PyQt slots
        self._window.setSaveToDir.connect(self.setSaveToDir)
        self._window.showSaveToDir.connect(self.showSaveToDir)
        self._window.rampage.connect(self.rampage)
        self._window.showWindowPreference.connect(self.showWindowPreference)

        # Connect PyQt signals to QML slots
        self.beforeRampage.connect(self._window.beforeRampage)
        self.afterRampage.connect(self._window.afterRampage)

        self.loadSaveToDir.connect(self._window.loadSaveToDir)
        self.appendLogs.connect(self._window.appendLogs)

    @property
    def conf(self):
        """Configuration object used by the application."""
        return self._conf

    @Slot(str)
    def setSaveToDir(self, directory):
        """Remember ``directory`` and store it in the configuration."""
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @Slot(str)
    def showSaveToDir(self, directory):
        """Show ``directory`` via ``show_directory`` if it exists."""
        if os.path.exists(directory):
            show_directory(directory)

    @Slot()
    def showWindowPreference(self):
        """Reload preferences from the configuration and show the dialog."""
        self._scihub_preferences.load_from_conf()
        self._scihub_preferences.showWindowPreferences.emit()

    @Slot(str)
    def rampage(self, input):
        """Download PDF with input

        Args:
            input: query or query list file path

        """
        if os.path.exists(input):
            if is_text_file(input):
                # One query per line; blank lines are ignored.
                with open(input, 'rt') as f:
                    stripped_lines = (line.strip() for line in f)
                    self._query_list = deque(
                        line for line in stripped_lines if line != '')

                self._query_list_length = len(self._query_list)
                self.rampage_query_list()
            else:
                self.log('<hr/>')
                self.log(self.tr('Query list file is not a text file!'),
                         'ERROR')
        elif is_range_query(input):
            self._query_list = deque(gen_range_query_list(input))
            self._query_list_length = len(self._query_list)
            self.rampage_query_list()
        else:
            self.rampage_query(input)

    def rampage_query_list(self):
        """Download PDF with query list (self._query_list)

        Processes the next pending query, if any, and logs its position
        in the list.
        """
        if self._query_list:
            self.log('<hr/>')
            self.log(
                self.tr('Dealing with {}/{} query ...').format(
                    self._query_list_length - len(self._query_list) + 1,
                    self._query_list_length))
            self.rampage_query(self._query_list.popleft())

    def rampage_query(self, query):
        """Download PDF with query

        Args:
            query: Query of input

        """
        scihub_api = SciHubAPI(query, callback=self.rampage_callback,
                               rampage_type=SciHubRampageType.INPUT,
                               conf=self._conf, log=self.log)
        self.beforeRampage.emit()
        scihub_api.start()

    def rampage_with_captcha(self, captcha_answer):
        """ Download PDF with captcha query (self._captcha_query) and captcha answer

        Args:
            captcha_answer: Captcha answer

        """
        scihub_api = SciHubAPI(
            self._captcha_query, callback=self.rampage_callback,
            rampage_type=SciHubRampageType.PDF_CAPTCHA_RESPONSE,
            conf=self._conf, log=self.log, captcha_answer=captcha_answer)

        # The captcha image is no longer needed once an answer is given;
        # drop the reference so the closed handle is not kept around.
        if self._captcha_img_file:
            self._captcha_img_file.close()
            self._captcha_img_file = None

        self.beforeRampage.emit()
        scihub_api.start()

    def rampage_callback(self, res, err):
        """Callback function

        Args:
            res: Result from last round rampage
            err: Error

        """
        if err == SciHubError.BLOCKED_BY_CAPTCHA:
            self.show_captcha(res)
        else:
            self._continue_or_finish()

    def _continue_or_finish(self):
        """Start the next pending query, or signal that all work is done."""
        if self._query_list:
            self.rampage_query_list()
        else:
            self.afterRampage.emit()

    def show_captcha(self, pdf_captcha_response):
        """Callback function for PDF captcha response

        Args:
            pdf_captcha_response: PDF captcha response

        """
        # Local import: pathlib is only needed to build the file URL.
        from pathlib import Path

        self._captcha_query = pdf_captcha_response

        scihub_api = SciHubAPI('', log=self.log, conf=self._conf)
        _, captcha_img_url = scihub_api.get_captcha_info(pdf_captcha_response)

        # The response may be an HTML page without a captcha; there is
        # nothing to download or show then, so treat it as a failed query
        # instead of requesting a None URL.
        if captcha_img_url is None:
            self.log(self.tr('Failed to get captcha image!'), 'ERROR')
            self._continue_or_finish()
            return

        self._captcha_img_file = scihub_api.download_captcha_img(
            captcha_img_url)

        # Path.as_uri() yields a valid file URL on every platform, unlike
        # plain 'file://' + path concatenation (which breaks on Windows).
        captcha_img_local_url = Path(self._captcha_img_file.name).as_uri()

        self._scihub_captcha.showWindowCaptcha.emit(captcha_img_local_url)

    def log(self, message, level=None):
        """Forward ``message`` and ``level`` to the UI log."""
        self.appendLogs.emit(message, level)
class SciHubEVA(QObject):
    """Application controller bridging the QML window and ``SciHubAPI``."""

    beforeRampage = pyqtSignal()
    afterRampage = pyqtSignal()

    showErrorMessage = pyqtSignal(str, str)
    showInfoMessage = pyqtSignal(str, str)

    setSaveToDir = pyqtSignal(str)
    appendLogs = pyqtSignal(str)

    def __init__(self):
        """Load configuration and the QML UI, then connect signals.

        The first root object of the loaded QML document is used as the
        main window. If a non-blank save directory is configured it is
        pushed to the UI through ``setSaveToDir``.
        """
        super(SciHubEVA, self).__init__()

        self._conf = SciHubConf()

        self._engine = QQmlApplicationEngine()
        self._engine.load('qrc:/ui/SciHubEVA.qml')
        root_objects = self._engine.rootObjects()
        self._window = root_objects[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(self._conf)
        self._scihub_captcha = SciHubCaptcha(self, log=self.log)
        self._captcha_query = None

        # Only a non-blank configured directory is remembered and shown.
        configured_dir = self._conf.get('common', 'save_to_dir')
        if configured_dir and configured_dir.strip() != '':
            self._save_to_dir = configured_dir
            self.setSaveToDir.emit(configured_dir)
        else:
            self._save_to_dir = None

    def _connect(self):
        """Wire QML signals to Python slots and Python signals to QML slots."""
        window = self._window

        # Connect QML signals to PyQt slots
        window.saveToDir.connect(self.saveToDir)
        window.rampage.connect(self.rampage)
        window.showWindowPreference.connect(self.showWindowPreferences)

        # Connect PyQt signals to QML slots
        self.beforeRampage.connect(window.beforeRampage)
        self.afterRampage.connect(window.afterRampage)

        self.showErrorMessage.connect(window.showErrorMessage)
        self.showInfoMessage.connect(window.showInfoMessage)

        self.setSaveToDir.connect(window.setSaveToDir)
        self.appendLogs.connect(window.appendLogs)

    @property
    def conf(self):
        """Configuration object used by the application."""
        return self._conf

    @pyqtSlot(str)
    def saveToDir(self, directory):
        """Remember ``directory`` and store it in the configuration."""
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @pyqtSlot()
    def showWindowPreferences(self):
        """Reload preferences from the configuration and show the dialog."""
        preferences = self._scihub_preferences
        preferences.loadFromConf()
        preferences.showWindowPreferences.emit()

    @pyqtSlot(str)
    def rampage(self, input_query):
        """Start a worker thread that downloads the PDF for the input query.

        Args:
            input_query: Query of input

        """
        worker = SciHubAPI(input_query, callback=self.rampage_callback,
                           rampage_type=SciHubRampageType.INPUT,
                           conf=self._conf, log=self.log)
        self.beforeRampage.emit()
        worker.start()

    def rampageWithCaptchar(self, captcha_answer):
        """Start a worker thread that retries the stored captcha query.

        Uses ``self._captcha_query`` together with the given answer.

        Args:
            captcha_answer: Captcha answer

        """
        worker = SciHubAPI(
            self._captcha_query, callback=self.rampage_callback,
            rampage_type=SciHubRampageType.PDF_CAPTCHA_RESPONSE,
            conf=self._conf, log=self.log, captcha_answer=captcha_answer)
        self.beforeRampage.emit()
        worker.start()

    def rampage_callback(self, res, err):
        """Handle the outcome of a worker run.

        Args:
            res: Result from last round rampage
            err: Error

        """
        if err != SciHubError.BLOCKED_BY_CAPTCHA:
            self.afterRampage.emit()
            return

        self.captcha_callback(res)

    def captcha_callback(self, pdf_captcha_response):
        """Store the captcha response and show its image in the captcha window.

        Args:
            pdf_captcha_response: PDF captcha response

        """
        self._captcha_query = pdf_captcha_response
        captcha_info = SciHubAPI.get_captcha_info(pdf_captcha_response)
        _, captcha_img_url = captcha_info
        self._scihub_captcha.showWindowCaptcha.emit(captcha_img_url)

    def log(self, message, level=None):
        """Send ``message`` to the UI log, prefixed with its level if given."""
        template = '[{level}] - {message}' if level else '{message}'
        self.appendLogs.emit(template.format(level=level, message=message))
class SciHubAPI(QObject, threading.Thread):
    """Worker thread that downloads a PDF through Sci-Hub.

    ``run`` processes the query given at construction time according to
    its rampage type and reports ``(res, err)`` to the callback.
    """

    def __init__(self, query, log, callback=None, rampage_type=None, conf=None, **kwargs):
        """Store the query and prepare the HTTP session.

        Args:
            query: Query to process (user input or a PDF captcha response).
            log: Logging callable taking ``(message, level)``.
            callback: Optional callable invoked with ``(res, err)`` when
                ``run`` finishes.
            rampage_type: Kind of processing requested for ``query``.
            conf: Configuration object; ``SciHubConf('SciHubEVA.conf')``
                is created when a falsy value is given.
            **kwargs: Optional extras. ``captcha_answer`` is the only key
                read here.
        """
        QObject.__init__(self)
        threading.Thread.__init__(self)

        self._query = query
        self.log = log
        self._callback = callback
        self._rampage_type = rampage_type

        # Captcha answer, used only when
        # rampage_type == SciHubRampageType.PDF_CAPTCHA_RESPONSE.
        # Always defined (None when not supplied) so later reads cannot
        # fail with AttributeError.
        self._captcha_answer = kwargs.get('captcha_answer')

        if conf:
            self._conf = conf
        else:
            self._conf = SciHubConf('SciHubEVA.conf')

        # The session headers are stored in the configuration as JSON.
        self._sess = requests.Session()
        self._sess.headers = json.loads(
            self._conf.get('network', 'session_header'))

        # One retry budget is shared by total, read and connect retries.
        retry_times = self._conf.getint('network', 'retry_times')
        retry = Retry(total=retry_times, read=retry_times, connect=retry_times)
        adapter = HTTPAdapter(max_retries=retry)
        self._sess.mount('http://', adapter)
        self._sess.mount('https://', adapter)

        self._set_http_proxy()

        self._doi_pattern = r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'])\S)+)\b'
        self._illegal_filename_pattern = r'[\/\\\:\*\?\"\<\>\|]'

    def _set_http_proxy(self):
        """Configure the session proxies from the ``proxy`` config section.

        Builds ``type://[username[:password]@]host[:port]`` and uses it for
        both HTTP and HTTPS. Nothing happens when the proxy is disabled.
        """
        if not self._conf.getboolean('proxy', 'enabled'):
            return

        proxy_type = self._conf.get('proxy', 'type')
        proxy_host = self._conf.get('proxy', 'host')
        proxy_port = self._conf.get('proxy', 'port')
        proxy_username = self._conf.get('proxy', 'username')
        proxy_password = self._conf.get('proxy', 'password')

        proxy = proxy_type + '://'

        # Credentials are only meaningful with a username; the password
        # must be separated from it by ':' for the URL to be valid.
        if proxy_username:
            proxy += proxy_username
            if proxy_password:
                proxy += ':' + proxy_password
            proxy += '@'

        proxy += proxy_host

        if proxy_port:
            proxy += ':' + proxy_port

        self._sess.proxies = {'http': proxy, 'https': proxy}

    @staticmethod
    def get_pdf_metadata(pdf):
        """Get PDF metadata with PDF content

        Extraction is best effort: anything that cannot be read keeps its
        ``UNKNOWN_*`` placeholder.

        Args:
            pdf: PDF content (in bytes)

        Returns:
            metadata: PDF metadata dictionary with keys ``author``,
                ``title`` and ``year``

        """
        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        # The context manager closes the temp file on every exit path.
        with tempfile.TemporaryFile() as temp_pdf_file:
            temp_pdf_file.write(pdf)

            try:
                pdf_parser = PDFParser(temp_pdf_file)
                pdf_doc = PDFDocument(pdf_parser)
                pdf_metadata = pdf_doc.info[0]

                author = make_pdf_metadata_str(pdf_metadata.get('Author', ''))
                if author:
                    metadata['author'] = author

                title = make_pdf_metadata_str(pdf_metadata.get('Title', ''))
                if title:
                    metadata['title'] = title

                year = pdf_metadata_moddate_to_year(
                    make_pdf_metadata_str(pdf_metadata.get('ModDate', '')))
                if year:
                    metadata['year'] = year
            except Exception:
                # Deliberately ignored: metadata only decorates the file
                # name, so an unparsable PDF must not fail the download.
                pass

        return metadata

    def guess_query_type(self, query):
        """Guess query type

        Args:
            query: Query

        Returns:
            query_type: Query type, one of ``pdf``, ``url``, ``pmid``,
                ``doi`` or ``string``

        """
        # 'https...' also starts with 'http', so one prefix test suffices.
        if query.startswith('http'):
            if query.endswith('pdf'):
                query_type = 'pdf'
            else:
                query_type = 'url'
        elif query.isdigit():
            query_type = 'pmid'
        elif query.startswith('doi:') or re.match(self._doi_pattern, query):
            query_type = 'doi'
        else:
            query_type = 'string'

        self.log(self.tr('Query type: ') + query_type.upper(), 'INFO')

        return query_type

    def get_captcha_info(self, pdf_captcha_response):
        """Get captcha information with PDF captcha response

        Args:
            pdf_captcha_response: PDF captcha response

        Returns:
            captcha_id: Captcha ID, or None when the page has no captcha
            captcha_img_url: Captcha image URL, or None when the page has
                no captcha

        """
        captcha_id, captcha_img_url = None, None

        html = etree.HTML(pdf_captcha_response.content)
        imgs = html.xpath('//img[@id="captcha"]')
        ids = html.xpath('//input[@name="id"]')

        if len(imgs) > 0 and len(ids) > 0:
            captcha_id = ids[0].attrib['value']
            captcha_img_src = imgs[0].attrib['src']

            if captcha_img_src.startswith('http'):
                captcha_img_url = captcha_img_src
            else:
                # Relative source: resolve against the response's host.
                scheme, netloc, *_ = urlparse(pdf_captcha_response.url,
                                              scheme='http')
                captcha_img_url = scheme + '://' + netloc + captcha_img_src

        return captcha_id, captcha_img_url

    def download_captcha_img(self, captcha_img_url):
        """ Download captcha image

        Args:
            captcha_img_url: Captcha image URL

        Returns:
            Captcha image file: an open named temporary file; it stays
                empty when the server does not answer with status 200

        """
        captcha_img_file = NamedTemporaryFile()

        captcha_img_res = self._sess.get(captcha_img_url, stream=True)

        if captcha_img_res.status_code == 200:
            for chunk in captcha_img_res:
                captcha_img_file.write(chunk)

        # Flush so that other readers of the file see the full image.
        captcha_img_file.flush()

        return captcha_img_file

    def fetch_pdf_with_captcha(self, pdf_captcha_response):
        """Fetch PDF with captcha

        Args:
            pdf_captcha_response: PDF captcha response

        Returns:
            pdf: PDF content (in bytes)
            err: Error, ``SciHubError.WRONG_CAPTCHA`` when the answer did
                not yield a PDF

        """
        pdf, err = None, None

        captcha_id, _ = self.get_captcha_info(pdf_captcha_response)

        pdf_response = self._sess.post(
            pdf_captcha_response.url,
            data={
                'answer': self._captcha_answer,
                'id': captcha_id
            },
            verify=False,
            timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

        # .get() so that a missing header counts as "not a PDF" instead
        # of raising KeyError.
        if pdf_response.headers.get('Content-Type') == 'application/pdf':
            self.log(self.tr('Angel [CAPTCHA] down!'), 'INFO')
            pdf = pdf_response.content
        else:
            err = SciHubError.WRONG_CAPTCHA

        return pdf, err

    def fetch_pdf(self, pdf_url):
        """ Fetch PDF with PDF URL

        Args:
            pdf_url: PDF URL

        Returns:
            pdf: PDF (in bytes) or PDF captcha response (when downloading
                is blocked by captcha)
            err: Error; ``SciHubError.BLOCKED_BY_CAPTCHA`` for an HTML
                response, ``SciHubError.UNKNOWN`` for any other non-PDF
                response

        """
        self.log(self.tr('Fetching PDF ...'), 'INFO')

        pdf, err = None, None

        pdf_response = self._sess.get(
            pdf_url, verify=False,
            timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

        content_type = pdf_response.headers.get('Content-Type', '')

        if content_type == 'application/pdf':
            pdf = pdf_response.content
        elif content_type.startswith('text/html'):
            self.log(self.tr('Angel [CAPTCHA] is coming!'), 'WARN')
            err = SciHubError.BLOCKED_BY_CAPTCHA
            pdf = pdf_response
        else:
            self.log(self.tr('Unknown PDF Content-Type!'), 'ERROR')
            # Report the failure; otherwise the caller would go on to
            # save a None PDF.
            err = SciHubError.UNKNOWN

        return pdf, err

    def fetch_pdf_url(self, query):
        """Fetch PDF URL with query

        Args:
            query: Query

        Returns:
            pdf_url: PDF URL
            err: Error

        """
        scihub_url = self._conf.get('network', 'scihub_url')
        self.log(
            self.tr('Using Sci-Hub URL: ') +
            '<a href="{scihub_url}">{scihub_url}</a>'.format(
                scihub_url=scihub_url), 'INFO')

        query_type = self.guess_query_type(query)
        # A direct PDF link needs no lookup on Sci-Hub.
        pdf_url = query
        err = None

        if query_type != 'pdf':
            try:
                self.log(self.tr('Fetching PDF URL ...'), 'INFO')

                pdf_url_response = self._sess.post(
                    scihub_url, data={'request': query}, verify=False,
                    timeout=self._conf.getfloat('network', 'timeout') / 1000.0)

                # etree.HTML() may return None; guard before xpath().
                html = etree.HTML(pdf_url_response.content)
                iframes = html.xpath(
                    '//iframe[@id="pdf"]') if html is not None else None

                if iframes:
                    # urlparse(..., scheme='http') supplies a scheme when
                    # the iframe source lacks one.
                    pdf_url = urlparse(iframes[0].attrib['src'],
                                       scheme='http').geturl()
                    pdf_url_html = '<a href="{pdf_url}">{pdf_url}</a>'.format(
                        pdf_url=pdf_url)

                    self.log(self.tr('Got PDF URL: ') + pdf_url_html, 'INFO')
                else:
                    err = SciHubError.NO_VALID_IFRAME

                    request_url = '{scihub_url}/{query}'.format(
                        scihub_url=scihub_url, query=query)
                    request_url_html = '<a href="{request_url}">{request_url}</a>'.format(
                        request_url=request_url)
                    response_url = pdf_url_response.url
                    response_url_html = '<a href="{response_url}">{response_url}</a>'.format(
                        response_url=response_url)

                    self.log(self.tr('Failed to get PDF URL!'), 'ERROR')
                    self.log(self.tr('No valid <iframe>!'), 'ERROR')
                    self.log(self.tr('You may need handle it manually.'),
                             'INFO')
                    self.log(
                        self.tr('Request URL: ') + request_url_html, 'INFO')
                    self.log(
                        self.tr('Response URL: ') + response_url_html, 'INFO')
            except Exception as e:
                err = SciHubError.UNKNOWN

                self.log(self.tr('Failed to get PDF URL!'), 'ERROR')
                self.log(str(e), 'ERROR')

        return pdf_url, err

    def save_pdf(self, pdf, filename):
        """Save pdf to local

        The file name is the configured ``filename_prefix_format``
        (formatted with the PDF metadata) followed by ``_`` and
        ``filename``.

        Args:
            pdf: PDF content (in bytes)
            filename: PDF filename

        """
        pdf_metadata = self.get_pdf_metadata(pdf)

        # Format only the configured prefix: the remote file name is
        # arbitrary text and may contain braces that would break format().
        pdf_name_prefix = self._conf.get(
            'common', 'filename_prefix_format').format(**pdf_metadata)

        # Replace characters that are not allowed in file names.
        safe_filename = re.sub(self._illegal_filename_pattern, '_', filename)

        pdf_name = pdf_name_prefix + '_' + safe_filename
        pdf_path = os.path.join(self._conf.get('common', 'save_to_dir'),
                                pdf_name)

        with open(pdf_path, 'wb') as fp:
            fp.write(pdf)

        pdf_link = '<a href="file:///{pdf_path}">{pdf_path}</a>'.format(
            pdf_path=pdf_path)

        self.log(self.tr('Saved PDF as: ') + pdf_link, 'INFO')

    def rampage(self, query, rampage_type):
        """Main process of downloading PDF

        Args:
            query: Query (input, response of fetching PDF, ...)
            rampage_type: Rampage type

        Returns:
            res: Result of rampage, maybe used for next steps
            err: Error of rampage

            e.g. (None, None), (pdf_captcha_response, SciHubError.BLOCKED_BY_CAPTCHA), ...

        """
        if rampage_type == SciHubRampageType.INPUT:
            # Query is user input
            self.log('<hr/>')
            self.log(self.tr('Dealing with query: ') + query, 'INFO')

            # Fetch PDF URL
            pdf_url, err = self.fetch_pdf_url(query)
            if err is not None:
                return None, err

            # Fetch PDF
            pdf, err = self.fetch_pdf(pdf_url)
            if err == SciHubError.BLOCKED_BY_CAPTCHA:
                # Hand the captcha response back to the caller.
                return pdf, err
            elif err is not None:
                return None, err

            # Save PDF
            filename = urlparse(pdf_url).path[1:].split('/')[-1]
            self.save_pdf(pdf, filename)
        elif rampage_type == SciHubRampageType.PDF_CAPTCHA_RESPONSE:
            # Query is PDF captcha response (with answer)

            # Fetch PDF with Captcha
            pdf, err = self.fetch_pdf_with_captcha(query)
            if err == SciHubError.WRONG_CAPTCHA:
                self.log(
                    self.tr('Wrong captcha, failed to kill Angel [CAPTCHA]!'),
                    'ERROR')
                return None, err

            # Save PDF
            filename = urlparse(query.url).path[1:].split('/')[-1]
            self.save_pdf(pdf, filename)

        return None, None

    def run(self):
        """Thread entry point: process the query and report to the callback.

        Any exception escaping ``rampage`` is logged and reported as
        ``SciHubError.UNKNOWN`` so the callback is always notified. The
        callback is optional.
        """
        try:
            res, err = self.rampage(self._query, self._rampage_type)
        except Exception as e:
            # Thread boundary: nothing above us can handle the exception.
            self.log(self.tr('Failed to get PDF!'), 'ERROR')
            self.log(str(e), 'ERROR')
            res, err = None, SciHubError.UNKNOWN

        if self._callback:
            self._callback(res, err)
class SciHubEVA(QObject):
    """Application controller between the QML window and the Sci-Hub workers.

    Loads the QML UI, connects window signals to the slots below, starts one
    ``SciHubAPI`` worker per query, and forwards log messages both to the UI
    (``appendLogs``) and to a rotating log file.
    """

    # Emitted right before / after a download worker is started / finished
    beforeRampage = Signal()
    afterRampage = Signal()

    # Pushes the configured save directory to the window
    loadSaveToDir = Signal(str)
    # (HTML message, level name) pair displayed by the window
    appendLogs = Signal(str, str)

    def __init__(self):
        super(SciHubEVA, self).__init__()

        # Logging goes first: self.log is handed to collaborators and signals
        # are connected below, so it must be usable from here on.
        self._setup_logging()

        self._conf = SciHubConf('SciHubEVA.conf')
        self._qt_quick_controls2_conf = SciHubConf(
            'qtquickcontrols2.conf', space_around_delimiters=False)

        self._engine = QQmlApplicationEngine()
        root_context = self._engine.rootContext()
        root_context.setContextProperty(
            'PYTHON_VERSION', '.'.join(str(v) for v in sys.version_info[:3]))
        root_context.setContextProperty(
            'QT_VERSION', PySide2.QtCore.qVersion())
        self._engine.load('qrc:/ui/SciHubEVA.qml')
        self._window = self._engine.rootObjects()[0]
        self._connect()

        self._scihub_preferences = SciHubPreferences(
            self._conf, self._qt_quick_controls2_conf)
        self._scihub_captcha = SciHubCaptcha(self, log=self.log)
        # Response that was blocked by a captcha, replayed with the answer
        self._captcha_query = None

        # Raw user input of the current rampage (query or list file path)
        self._input = None

        save_to_dir = self._conf.get('common', 'save_to_dir')
        if not save_to_dir or save_to_dir.strip() == '':
            self._save_to_dir = None
        else:
            self._save_to_dir = save_to_dir
            self.loadSaveToDir.emit(save_to_dir)

        # Pending queries (deque) and the size the list started with
        self._query_list = None
        self._query_list_length = 0

        # Local path of the captcha image currently shown, if any
        self._captcha_img_file_path = None

    def _setup_logging(self):
        """Create the file logger and the HTML-to-text converter used by log()."""
        self._logger = logging.getLogger('SciHubEVA')
        self._logger.setLevel(logging.DEBUG)

        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        log_file_name_prefix = str(get_log_directory() / 'SciHubEVA.log')
        handler = TimedRotatingFileHandler(
            filename=log_file_name_prefix, when='D')
        handler.setFormatter(formatter)
        handler.setLevel(logging.DEBUG)
        self._logger.addHandler(handler)

        # UI messages are HTML; the log file gets their plain-text rendering
        self._h2t = html2text.HTML2Text()
        self._h2t.ignore_links = True

    def _connect(self):
        """Wire the QML window and this object together in both directions."""
        # Connect QML signals to Python slots
        self._window.setSaveToDir.connect(self.setSaveToDir)
        self._window.openSaveToDir.connect(self.openSaveToDir)
        self._window.rampage.connect(self.rampage)
        self._window.showWindowPreference.connect(self.showWindowPreference)
        self._window.openLogFile.connect(self.openLogFile)
        self._window.openLogDirectory.connect(self.openLogDirectory)

        # Connect Python signals to QML slots
        self.beforeRampage.connect(self._window.beforeRampage)
        self.afterRampage.connect(self._window.afterRampage)
        self.loadSaveToDir.connect(self._window.loadSaveToDir)
        self.appendLogs.connect(self._window.appendLogs)

    @property
    def conf(self):
        """Application configuration (``SciHubConf``) loaded in ``__init__``."""
        return self._conf

    @Slot(str)
    def setSaveToDir(self, directory):
        """Remember the save directory and store it in the configuration.

        Args:
            directory: Directory to save PDFs to
        """
        self._save_to_dir = directory
        self._conf.set('common', 'save_to_dir', directory)

    @Slot(str)
    def openSaveToDir(self, directory):
        """Open the directory with ``open_directory`` if the path exists.

        Args:
            directory: Directory to open
        """
        if os.path.exists(directory):
            open_directory(directory)

    @Slot()
    def showWindowPreference(self):
        """Reload preferences from the configuration and show their window."""
        self._scihub_preferences.load_from_conf()
        self._scihub_preferences.showWindowPreferences.emit()

    @Slot()
    def openLogFile(self):
        """Open the current log file (``SciHubEVA.log`` in the log directory)."""
        open_file(str(get_log_directory() / 'SciHubEVA.log'))

    @Slot()
    def openLogDirectory(self):
        """Open the directory that holds the log files."""
        open_directory(str(get_log_directory()))

    @Slot(str)
    def rampage(self, input):
        """Download PDF with input

        The input is treated, in this order, as: a path to an existing text
        file with one query per line, a range query, or a single query.

        Args:
            input: query or query list file path
        """
        self._input = input

        if os.path.exists(input):
            if is_text_file(input):
                # One query per line; blank lines are ignored
                with open(input, 'rt') as f:
                    stripped_lines = (line.strip() for line in f)
                    self._query_list = deque(
                        query for query in stripped_lines if query)

                self._query_list_length = len(self._query_list)
                self.rampage_query_list()
            else:
                self.log('<hr/>')
                self.log(self.tr('Query list file is not a text file!'),
                         logging.ERROR)
        elif is_range_query(input):
            self._query_list = deque(gen_range_query_list(input))
            self._query_list_length = len(self._query_list)
            self.rampage_query_list()
        else:
            self.rampage_query(input)

    def rampage_query_list(self):
        """Download PDF with query list (self._query_list)

        Takes the next pending query, logs its position in the list and
        starts it. Does nothing when no query is pending.
        """
        if not self._query_list:
            return

        current = self._query_list_length - len(self._query_list) + 1
        self.log('<hr/>')
        self.log(
            self.tr('Dealing with {}/{} query ...').format(
                current, self._query_list_length))
        self.rampage_query(self._query_list.popleft())

    def rampage_query(self, query):
        """Download PDF with query

        Args:
            query: Query of input
        """
        scihub_api = SciHubAPI(self._input, query,
                               callback=self.rampage_callback,
                               rampage_type=SciHubRampageType.INPUT,
                               conf=self._conf, log=self.log)
        self.beforeRampage.emit()
        scihub_api.start()

    def rampage_with_captcha(self, captcha_answer):
        """Download PDF with captcha query (self._captcha_query) and captcha answer

        Args:
            captcha_answer: Captcha answer
        """
        # The shown captcha image is no longer needed once it was answered.
        # The path is None until a captcha has been shown, and isfile()
        # already implies that the path exists.
        captcha_img_path = self._captcha_img_file_path
        if captcha_img_path and os.path.isfile(captcha_img_path):
            os.remove(captcha_img_path)

        scihub_api = SciHubAPI(
            self._input, self._captcha_query,
            callback=self.rampage_callback,
            rampage_type=SciHubRampageType.PDF_CAPTCHA_RESPONSE,
            conf=self._conf, log=self.log,
            captcha_answer=captcha_answer)
        self.beforeRampage.emit()
        scihub_api.start()

    def rampage_callback(self, res, err):
        """Callback function passed to every SciHubAPI worker

        Shows the captcha when the download was blocked by one, otherwise
        continues with the next pending query or signals the end of rampage.

        Args:
            res: Result from last round rampage
            err: Error
        """
        if err == SciHubError.BLOCKED_BY_CAPTCHA:
            self.show_captcha(res)
        elif self._query_list:
            self.rampage_query_list()
        else:
            self.afterRampage.emit()

    def show_captcha(self, pdf_captcha_response):
        """Callback function for PDF captcha response

        Downloads the captcha image and asks the captcha window to show it.

        Args:
            pdf_captcha_response: PDF captcha response
        """
        self._captcha_query = pdf_captcha_response

        scihub_api = SciHubAPI(self._input, None, log=self.log, conf=self._conf)
        _, captcha_img_url = scihub_api.get_captcha_info(pdf_captcha_response)
        captcha_img_file = scihub_api.download_captcha_img(captcha_img_url)

        captcha_img_path = Path(captcha_img_file.name)
        # Kept so that rampage_with_captcha can delete the image afterwards
        self._captcha_img_file_path = captcha_img_path.as_posix()
        captcha_img_local_uri = captcha_img_path.as_uri()

        self._scihub_captcha.showWindowCaptcha.emit(captcha_img_local_uri)

    @staticmethod
    def _resolve_log_level(level):
        """Normalise a log level into (numeric level, level name for the UI).

        Args:
            level: ``logging`` level number, level name such as 'INFO',
                or None / empty for "no level"

        Returns:
            Tuple of the numeric level used for the file log and the level
            name sent to the UI ('' when no level was given).
        """
        if not level:
            return logging.INFO, ''

        if isinstance(level, str):
            # getLevelName maps a known name to its number; anything else
            # comes back as a string
            level_no = logging.getLevelName(level.strip().upper())
            if isinstance(level_no, int):
                return level_no, logging.getLevelName(level_no)
            # Unknown name: show it unchanged, file it under INFO
            return logging.INFO, level

        return level, logging.getLevelName(level)

    def log(self, message: str, level=None):
        """Send a message to the UI log view and to the log file

        Args:
            message: Message, may contain HTML (converted to plain text
                for the log file)
            level: ``logging`` level number or level name; without a level
                the UI gets an empty level name and the file log uses INFO
        """
        level_no, level_name = self._resolve_log_level(level)
        self.appendLogs.emit(message, level_name)

        text_message = self._h2t.handle(message).strip()
        if text_message:
            self._logger.log(level_no, text_message)