def getSparkLogger(cls, logfile="", conffile=""):
    if logfile == "":
        logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
            "logfile", conffile=conffile)
    from pyspark import SparkContext
    #sc = SparkContext.getOrCreate()
    #log4j = sc._jvm.org.apache.log4j
    #logger = log4j.LogManager.getLogger("myLogger")
    #logger = log4j.LogManager.getLogger(__name__)
    #logger = log4j.LogManager.getRootLogger()
    #logger.appender.FILE.File = "../../var/log/log"
    #logger.setLevel(log4jLogger.Level.DEBUG)
    #logger.info("aaaa")
    #return logger
    if 'LOG_DIRS' not in os.environ:
        sys.stderr.write(
            'Missing LOG_DIRS environment variable, pyspark logging disabled')
        return
    file = os.environ['LOG_DIRS'].split(',')[0] + '/log'
    logging.basicConfig(
        filename=file,
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s')
    logger = logging.getLogger()
    return logger
def getLogger(cls, logfile="", conffile=""):
    from conf import Conf
    if logfile == "":
        logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
            "logfile", conffile=conffile)
    loglevel = Conf.getconf("loglevel", conffile=conffile)
    rotate_log_size = Conf.getconf("rotate_log_size")
    import logging, logging.handlers
    logger = logging.getLogger()
    if len(logger.handlers) < 1:
        #fh = logging.FileHandler(filename="../../var/log/log2")
        #logger.addHandler(fh)
        rfh = logging.handlers.RotatingFileHandler(
            filename=logfile,
            maxBytes=rotate_log_size,
            backupCount=Conf.getconf("backup_log_count"))
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        rfh.setFormatter(formatter)
        logger.addHandler(rfh)
        #logging.basicConfig(filename="../../var/log/log2")
    id_ = id(logger)
    # resolve the level name safely instead of eval()
    logger.setLevel(getattr(logging, loglevel))
    logger.debug(
        "return logger\n logfile[{logfile}]\n rotate_log_size[{rotate_log_size}]\n id[{id_}]"
        .format(**locals()))
    return logger
def __init__(self):
    self.log = Log.getLogger()
    self.driver = self.create_driver()
    self.top_url = Conf.getconf("kakaku_top_page")
    self.target_stores = Conf.getconf("target_stores")
    self.extract_store_name = re.compile(r"\'")
    self.warning_messages = False
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
def move_to_vendor_page(self, vendor_button):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start.")
    self.driver.get(vendor_button.get_attribute("href"),
                    warning_messages=self.warning_messages)
    self.log.debug("wait start")
    for sec in range(Conf.getconf("phantomJS_load_timeout")):
        self.log.debug("wait redirect " + str(sec) + "[sec]")
        if self.driver.title:
            self.log.debug("move to shop page finished. page title: " +
                           self.driver.title)
            break
        time.sleep(Conf.getconf("vendor_page_wait_time"))
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished.")
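# The wait loop above reduces to a small reusable pattern: poll a condition
# at a fixed interval until it holds or the attempt budget runs out. A
# minimal, self-contained sketch (condition is a caller-supplied callable,
# not part of the original source):
import time

def wait_until(condition, attempts=30, interval_sec=1.0):
    for _ in range(attempts):
        if condition():
            return True
        time.sleep(interval_sec)
    return False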
def save_current_page(self, filename):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
    path, suffix = os.path.splitext(filename)
    max_filename_length = Conf.getconf("max_filename_length")
    if len(path) > max_filename_length:
        self.log.debug("filename too long. convert from :" + filename)
        filename = path[:max_filename_length] + suffix
        self.log.debug("to :" + filename)
    self.log.debug("path[" + path + "], suffix[" + suffix + "]")
    if suffix == ".html":
        with open(filename, 'w') as f:
            f.write(self.page_source)
    elif suffix == ".png":
        self.save_screenshot(filename)
    elif suffix == ".pdf":
        pngname = os.path.splitext(filename)[0] + ".png"
        self.save_screenshot(pngname)
        self.convert_png_to_pdf(pngname)
    else:
        self.log.error(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name)
        self.log.error("TYPEERROR suffix[" + suffix + "]")
    self.log.debug("saved to " + filename)
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished")
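# A self-contained sketch of the filename-capping rule above: truncate the
# basename while preserving the suffix (the length limit here is a
# placeholder, not the project's conf value):
import os

def cap_filename(filename, max_filename_length=200):
    path, suffix = os.path.splitext(filename)
    if len(path) > max_filename_length:
        filename = path[:max_filename_length] + suffix
    return filename

# e.g. cap_filename("a" * 300 + ".png") keeps 200 "a"s plus ".png"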
def getLogger(cls, logfile="", conffile=""):
    exec_env = Conf.getconf("exec_env")
    if exec_env == "normal":
        return cls.getNormalLogger(logfile=logfile, conffile=conffile)
    elif exec_env == "spark":
        return cls.getSparkLogger(logfile=logfile, conffile=conffile)
    else:
        sys.exit("getLogger Type Error[" + str(exec_env) + "]")
def getNormalLogger(cls, logfile="", conffile=""):
    if logfile == "":
        logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
            "logfile", conffile=conffile)
    logger = logging.getLogger(logfile)
    if len(logger.handlers) > 1:
        # called before and already created
        return logger
    loglevel = Conf.getconf("loglevel", conffile=conffile)
    rotate_log_size = Conf.getconf("rotate_log_size")
    if len(logger.handlers) < 1:
        rfh = logging.handlers.RotatingFileHandler(
            filename=logfile,
            maxBytes=rotate_log_size,
            backupCount=Conf.getconf("backup_log_count"))
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        rfh.setFormatter(formatter)
        logger.addHandler(rfh)
        if Conf.getconf("loglevel_to_stdout", conffile=conffile):
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(formatter)
            stream_handler.setLevel(
                Conf.getconf("loglevel_to_stdout", conffile=conffile))
            logger.addHandler(stream_handler)
    id_ = id(logger)
    # resolve the level name safely instead of eval()
    logger.setLevel(getattr(logging, loglevel))
    logger.debug(
        "return normal logger\n logfile[{logfile}]\n rotate_log_size[{rotate_log_size}]\n id[{id_}]"
        .format(**locals()))
    return logger
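# A minimal, self-contained sketch of the handler setup above, using only the
# standard library (the filename, level, and sizes here are placeholders, not
# values from the project's conf file). Keying the logger on the logfile path,
# as getNormalLogger does, makes repeated calls return the same configured
# instance without stacking duplicate handlers:
import logging
import logging.handlers

def make_rotating_logger(logfile="./app.log", level="DEBUG",
                         max_bytes=1024 * 1024, backups=3):
    logger = logging.getLogger(logfile)  # one logger per logfile path
    if not logger.handlers:
        handler = logging.handlers.RotatingFileHandler(
            filename=logfile, maxBytes=max_bytes, backupCount=backups)
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(getattr(logging, level))
    return logger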
def establish_session(self):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
    import sqlalchemy
    from conf import Conf
    self.engine = sqlalchemy.create_engine(Conf.getconf("myslq_url"),
                                           echo=False)
    from sqlalchemy.orm import sessionmaker
    Session = sessionmaker(bind=self.engine)
    session = Session()
    session.expire_on_commit = False
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished")
    return session
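# Self-contained sketch of the same session setup (the sqlite URL is a
# placeholder; the project reads its MySQL URL from the "myslq_url" conf
# key). Passing expire_on_commit=False to sessionmaker is equivalent to
# setting it on the session afterwards, and keeps loaded attributes usable
# after a commit:
import sqlalchemy
from sqlalchemy.orm import sessionmaker

engine = sqlalchemy.create_engine("sqlite:///:memory:", echo=False)
Session = sessionmaker(bind=engine, expire_on_commit=False)
session = Session()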
def __init__(self, executable_path="",
             port=0, desired_capabilities=DesiredCapabilities.PHANTOMJS,
             service_args=None, service_log_path=None):
    self.executable_path = executable_path
    self.port = port
    self.PHANTOMJS = desired_capabilities
    self.service_args = service_args
    self.service_log_path = service_log_path
    self.log = Log.getLogger()
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
    import logging, logging.handlers
    selenium_logger = logging.getLogger(
        'selenium.webdriver.remote.remote_connection')
    selenium_logger.setLevel(logging.ERROR)
    if len(selenium_logger.handlers) < 1:
        rfh = logging.handlers.RotatingFileHandler(
            filename=Conf.getconf("logdir") +
            Conf.getconf("phantomjs_logfile"),
            maxBytes=Conf.getconf("rotate_log_size"),
            backupCount=Conf.getconf("backup_log_count"))
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        rfh.setFormatter(formatter)
        selenium_logger.addHandler(rfh)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        stream_handler.setLevel(Conf.getconf("loglevel_to_stdout"))
        selenium_logger.addHandler(stream_handler)
    if self.executable_path == "":
        self.executable_path = Conf.getconf("phantomJS_pass")
    if self.service_args is None:
        self.service_args = ["--webdriver-loglevel=DEBUG"]
    if self.service_log_path is None:
        self.service_log_path = Conf.getconf("logdir") + Conf.getconf(
            "phantomjs_logfile")
    self.log.debug(__class__.__name__ + ".super().__init__ start")
    super().__init__(executable_path=self.executable_path,
                     port=self.port,
                     desired_capabilities=self.PHANTOMJS,
                     service_args=self.service_args,
                     service_log_path=self.service_log_path)
    self.set_page_load_timeout(Conf.getconf("phantomJS_load_timeout"))
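# The quieting of selenium's wire-protocol chatter above is a general
# pattern: raise the level on a noisy library's named logger so only errors
# propagate to your handlers. A minimal sketch (the logger name is the real
# selenium one used above; everything else is illustrative):
import logging

noisy = logging.getLogger('selenium.webdriver.remote.remote_connection')
noisy.setLevel(logging.ERROR)  # DEBUG/INFO chatter is dropped at the source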
def save_cheapest_pdf(self, product_name, logger=None):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start.")
    self.log.debug("product_name[" + product_name + "]")
    if logger:
        self.log.debug("change logger")
        self.log = logger
        self.driver.log = logger
    print("move to kakaku.com")
    self.move_to_top_page()
    print("search product. name[" + product_name + "]")
    search_results = self.search_product(product_name)
    if len(search_results) > 1:
        self.log.warning("search results of product_name[" +
                         str(product_name) + "] = " +
                         str(len(search_results)) + " > 1.")
        self.log.warning("use only first result.")
    print("click top of search result")
    if not self.driver.click(search_results[0],
                             warning_messages=self.warning_messages):
        self.log.error("click failed. Please retry.")
        #exit(1)
        raise Exception("click failed")
    #tag = '//td[@class="fRed"]/p[@class="wordwrapTrs"]/a'
    tag = '//p[@class="wordwrapShop"]/a'
    self.driver.wait_appearance_of_tag(by="xpath", tag=tag)
    print("get cheapest vendor")
    cheapest_vendor, vendor_name = self.get_cheapest_vendor_button(
        product_name)
    print("move_to_vendor_page")
    self.move_to_vendor_page(cheapest_vendor)
    path = Conf.getconf("pdf_save_path")
    print("save as " + path + "/" + product_name + "|" + vendor_name +
          ".pdf")
    self.driver.save_current_page(path + "/" + product_name + "|" +
                                  vendor_name + ".pdf")
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished.")
def get(self, url, tag_to_wait="", by="xpath", timeout="default",
        warning_messages=True):
    if timeout == "default":
        timeout = self.load_timeout
    retries = 10
    while retries > 0:
        try:
            self.log.debug("super().get(" + url + ") start")
            super().get(url)
            break
        except RemoteDisconnected as e:
            self.log.debug("PhantomJS caught RemoteDisconnected at get " +
                           url)
            self.log.debug("%s", e)
            self.log.debug("retries[" + str(retries) + "]")
            # re-create the underlying driver session, then retry
            # (the original passed a logger= kwarg here, which __init__
            # does not accept and would raise TypeError)
            super().__init__(executable_path=self.executable_path,
                             port=self.port,
                             desired_capabilities=self.PHANTOMJS,
                             service_args=self.service_args,
                             service_log_path=self.service_log_path)
            retries -= 1
        except TimeoutException as e:
            self.save_error_messages_at(sys._getframe().f_code.co_name,
                                        "by[" + by + "], tag[" +
                                        tag_to_wait + "]",
                                        warning_messages, e, url=url)
            # stop loading and proceed with the partially rendered page
            self.execute_script("window.stop();")
            break
    if retries == 0:
        self.log.error("PhantomJS caught ERROR RemoteDisconnected at get " +
                       url)
        self.save_current_page("../../var/ss/get_error.html")
        self.save_current_page("../../var/ss/get_error.png")
    wait_time = Conf.getconf("phantomJS_wait_time_per_get")
    self.log.debug("get finished. wait " + str(wait_time) + " seconds")
    time.sleep(wait_time)
    if tag_to_wait != "":
        self.wait_appearance_of_tag(by=by, tag=tag_to_wait, timeout=timeout)
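# The retry pattern above, reduced to a self-contained sketch: attempt a
# flaky operation a fixed number of times, rebuilding the broken session on a
# connection-level failure before retrying (fetch and reconnect are
# caller-supplied placeholders, not part of the original source):
import time

def get_with_retries(fetch, reconnect, retries=10, wait=1.0):
    while retries > 0:
        try:
            return fetch()
        except ConnectionError:
            reconnect()  # rebuild the broken session, then retry
            retries -= 1
            time.sleep(wait)
    raise RuntimeError("all retries exhausted")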
class IEEEXplore:
    def __init__(self):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()
        self.opt = Search_options()
        self.log.debug("class " + __class__.__name__ + " created.")

    def get_papers_of_new_conferences(self, conference_num):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + "(conference_num=" +
                      str(conference_num) + ") start.")

    def get_papers_by_keywords(self, keywords, num_of_papers="all",
                               search_options="default",
                               path="../../data/tmp/", filename="title",
                               timeout=30):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        self.log.info("keywords[" + keywords + "], num_of_papers[" +
                      str(num_of_papers) + "]")
        driver = self.create_driver(timeout=timeout)
        if search_options == "default":
            search_options = Search_options()
        self.search_by_keywords(driver, keywords,
                                search_options=search_options,
                                timeout=timeout)
        if num_of_papers == "all":
            element = driver.find_element_by_css_selector(
                'div[class="pure-u-1-1 Dashboard-header ng-scope"] > span')
            num_of_papers = int(element.text.split(" ")[-1].replace(",", ""))
            self.log.debug("num_of_papers[" + str(num_of_papers) + "]")
        urls = self.get_urls_of_papers_in_keywords_page(
            driver, search_options.PerPage, num_of_papers, timeout)
        print("urls.size[" + str(len(urls)) + "]")
        all_papers = []
        all_citing_urls = []
        all_cited_urls = []
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/math")
        from searchs import Searchs
        search = Searchs(limit=num_of_papers)
        for url in urls:
            search.node = url
            paper, citing_urls, cited_urls = \
                self.get_attributes_and_download_pdf(
                    search, driver, path=path, filename=filename)
            print("paper.title[" + paper.title + "]")
            all_papers.append(paper)
            all_citing_urls.extend(citing_urls)
            all_cited_urls.extend(cited_urls)
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " finished")
        return all_papers, urls, all_citing_urls, all_cited_urls

    def get_papers_of_target_conference(self, conference_name):
        pass

    def create_driver(self, url="", timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("url[" + url + "]")
        if url == "" or url == self.conf.getconf("IEEE_top_page"):
            url = self.conf.getconf("IEEE_top_page")
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) +
            "/../../lib/scraping")
        from phantomjs_ import PhantomJS_
        driver = PhantomJS_(desired_capabilities={
            'phantomjs.page.settings.resourceTimeout': timeout
        })
        self.log.debug("driver.get(" + url + ")")
        driver.get(url, tag_to_wait='//li[@class="Media-articles-item"]',
                   by="xpath", timeout=timeout)
        self.log.debug("driver.get finished")
        """
        if url == self.conf.getconf("IEEE_top_page"):
            self.log.debug("Wait start.")
            try:
                WebDriverWait(driver, timeout).until(
                    lambda driver: driver.find_element_by_xpath(
                        '//li[@class="Media-articles-item"]'))
            except TimeoutException:
                self.log.warning(
                    "caught TimeoutException at load the IEEE top page.")
            except NoSuchElementException:
                self.log.warning(
                    "caught NoSuchElementException at load the IEEE top page.")
            self.log.debug("Wait Finished.")
        """
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return driver")
        return driver

    def wait_search_results(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("Wait start.")
        try:
            tag = ('//input[@type="checkbox" and '
                   '@data-group="search-results-group" and '
                   '@ng-checked="vm.allSelected()"]')
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(tag))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the keywords results page.")
            self.log.warning("at " + sys._getframe().f_code.co_name)
            self.log.warning("url[" + driver.current_url + "]")
            self.log.warning("tag[find_element_by_xpath(" + tag + ")]")
            filename = (
                "./samples/TimeoutExceptionatLoadtheKeywordsResultsPage." +
                re.sub(r"/|:|\?", "", driver.current_url))
            self.save_current_page(driver, filename + ".png")
            self.save_current_page(driver, filename + ".html")
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the keywords results page.")
        self.log.debug("Wait Finished.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return 0

    def search_by_keywords(self, driver, keywords, search_options="default",
                           timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        driver.wait_appearance_of_tag(by="name", tag='queryText',
                                      timeout=timeout)
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except Exception as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check logfile.")
        self.wait_search_results(driver, timeout)
        self.set_options(driver, search_options, timeout)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return 0

    def set_options(self, driver, search_options, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        try:
            #self.save_current_page(driver, "./samples/before_set_options.png")
            #self.save_current_page(driver, "./samples/before_set_options.html")
            ##show" : "All Resul
            ##PerPage" : "25"
            if search_options.PerPage != 25:
                element = driver.find_element_by_css_selector(
                    'div[ng-model="vm.rowsPerPage"] > div > select')
                #print(len(element))
                #print(element.text)
                Select(element).select_by_visible_text(
                    str(search_options.PerPage))
                self.wait_search_results(driver, timeout)
                #Select(element).select_by_value("object:75")
            ##SortBy" : "MostCit
            ##ContentType" : "No
            ##YearType" : "Range
            ##YearFrom" : "1996"
            ##YearTo" : "2017",
            ##Year" : "2017",
            ##Author" : "None",
            ##Affiliation" : "No
            ##PublicationTitle"
            ##Publisher" : "None
            ##ConferenceLocation
        except NoSuchElementException:
            print("caught NoSuchElementException at set_options.")
            self.save_current_page(
                driver, "./samples/NoSuchElementException_in_set_options.png")
            self.save_current_page(
                driver,
                "./samples/NoSuchElementException_in_set_options.html")
        #self.save_current_page(driver, "./samples/after_set_options.png")
        #self.save_current_page(driver, "./samples/after_set_options.html")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def get_urls_of_papers_in_keywords_page(self, driver, PerPage,
                                            num_of_papers="all", timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        if num_of_papers == "all":
            element = driver.find_element_by_css_selector(
                'div[class="pure-u-1-1 Dashboard-header ng-scope"] > span')
            num_of_papers = int(element.text.split(" ")[-1].replace(",", ""))
            self.log.debug("num_of_papers[" + str(num_of_papers) + "]")
        urls = []
        next_button = driver.find_element_by_xpath(
            '//a[@href="" and @ng-click="selectPage(page.number)" and '
            '@class="ng-binding"]')
        visited_buttons = [next_button.text]
        while True:
            self.log.debug("get paper urls in current page")
            for i in range(PerPage):
                paper_elements = driver.find_elements_by_xpath(
                    '//div[@class="js-displayer-content u-mt-1 '
                    'stats-SearchResults_DocResult_ViewMore ng-scope hide"]')
                self.log.debug("i[" + str(i) + "] len(paper_elements)[" +
                               str(len(paper_elements)) + "]")
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                if len(paper_elements) == PerPage:
                    break
            self.log.debug("len(paper_elements)[" +
                           str(len(paper_elements)) + "]")
            for paper_element in paper_elements:
                url = paper_element.find_element_by_css_selector(
                    'a').get_attribute("href")
                self.log.debug("url[" + url + "]")
                urls.append(url)
                if len(urls) > num_of_papers:
                    self.log.debug("len(urls)[" + str(len(urls)) +
                                   "] > num_of_papers[" +
                                   str(num_of_papers) + "]. return urls.")
                    return urls
            self.log.debug("search buttons to next page")
            buttons = driver.find_elements_by_xpath(
                '//a[@href="" and @ng-click="selectPage(page.number)" and '
                '@class="ng-binding"]')
            i = 0
            for button in buttons:
                self.log.debug("i[" + str(i) + "], button.text[" +
                               button.text + "], visited_buttons:" +
                               str(visited_buttons))
                if button.text not in visited_buttons:
                    next_button = button
                    self.log.debug("break")
                    break
                i += 1
            if i == len(buttons):
                self.log.debug(
                    "i = len(buttons). already visited all buttons. break")
                break
            visited_buttons.append(next_button.text)
            self.log.debug("move to next page[" + next_button.text + "]")
            next_button.click()
            self.wait_search_results(driver, timeout)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return urls[" + str(len(urls)) + "]")
        return urls

    def get_attributes_and_download_pdf(self, search, driver,
                                        path="../../data/tmp/",
                                        filename="title"):
        print(__class__.__name__ + "." + sys._getframe().f_code.co_name +
              " start")
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/db")
        search.times += 1
        timeout = 30
        target_paper_url = search.node
        m = ("url[" + target_paper_url + "], times[" + str(search.times) +
             "], limit[" + str(search.limit) + "]")
        print(m)
        self.log.info(m)
        ##reconnect because of http.client.RemoteDisconnected
        #if search.times % 5 == 0:
        #    driver = self.reconnect_driver(driver, driver.current_url)
        #self.save_current_page(driver, "./samples/tmp.png")
        ##if this paper is already downloaded, it is treated as visited and skipped.
        #if target_paper_url in search.visited:
        self.move_to_paper_initial_page(driver, target_paper_url)
        import table_papers
        paper = table_papers.Table_papers()
        self.log.debug("get attributes of this paper")
        paper.title = self.get_title(driver)
        paper.authors = self.get_authors(driver)
        paper.keywords = self.get_keywords(driver)
        paper.citings, citing_papers, citing_urls = self.get_citing_papers(
            driver, timeout)
        paper.citeds, cited_papers, cited_urls = self.get_cited_papers(
            driver, timeout)
        paper.conference = self.get_conference(driver)
        paper.published = self.get_date_of_publication(driver)
        paper.url = target_paper_url
        paper.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        if filename == "title":
            filename = paper.title + ".pdf"
        paper.path = self.download_a_paper(driver, path=path,
                                           filename=filename,
                                           timeout=timeout)
        self.log.debug("download finished. wait start.")
        time.sleep(self.conf.getconf("IEEE_wait_time_per_download_paper"))
        self.log.debug("wait finished.")
        paper.id = paper.get_id()
        self.log.debug(paper.get_vars())
        paper.renewal_insert()
        self.log.debug("insert citations of this paper to db")
        import table_citations
        for citing_paper in citing_papers:
            citation = table_citations.Table_citations(start=paper.id,
                                                       end=citing_paper.id)
            citation.renewal_insert()
            citation.close()
        for cited_paper in cited_papers:
            citation = table_citations.Table_citations(start=cited_paper,
                                                       end=paper.id)
            citation.renewal_insert()
            citation.close()
        self.log.debug("check termination of searching loop")
        if 0 < search.limit and search.times >= search.limit:
            self.log.debug("search finished.")
            search.que = [search.node]
            import signal
            driver.service.process.send_signal(
                signal.SIGTERM)  # kill the specific phantomjs child proc
            driver.quit()  # quit the node proc
            return paper, [], []
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        self.log.debug("return paper[" + paper.title + "] citing_urls[" +
                       str(citing_urls) + "] cited_urls[" +
                       str(cited_urls) + "]")
        return paper, citing_urls, cited_urls

    def get_title(self, driver):
        return driver.title

    def get_authors(self, driver):
        authors_str = ""
        elements = driver.find_elements_by_xpath(
            '//span[@ng-bind-html="::author.name"]')
        for el in elements:
            authors_str += "," + el.text
        return authors_str[1:]

    def get_keywords(self, driver):
        ##keywords
        keywords_str = ""
        elements = driver.find_elements_by_xpath(
            '//a[@ng-bind-html="::term"]')
        #print(str(len(elements))) #21
        for el in elements:
            keyword = el.text
            if keyword in keywords_str:  ##todo internet concludes int
                self.log.debug("keyword[" + keyword +
                               "] is duplicated. not added.")
            else:
                keywords_str += "," + el.text
        return keywords_str

    def get_citing_papers(self, driver, timeout=30):
        ##citing_papers
        ##citing_urls
        import table_papers
        citings_str = ""
        citing_papers = []
        citing_urls = []
        try:
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="article in vm.contextData.similar"]')
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at get_citing_papers.")
        self.log.debug(str(len(elements)))
        #self.save_current_page(driver, "./samples/sample_page_4116687_start.html")
        #self.save_current_page(driver, "./samples/sample_page_4116687_start.png")
        self.log.debug("create arrays of paper and url")
        for el in elements:
            citing_paper = table_papers.Table_papers()
            citing_paper.url = (self.conf.getconf("IEEE_website") +
                                el.find_element_by_css_selector(
                                    'a').get_attribute("ng-href"))
            citing_paper.title = el.find_element_by_css_selector(
                'a').get_attribute("title")
            citing_paper.authors = el.find_element_by_css_selector(
                'div[class="ng-binding"]').text.replace(";", ",")
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            self.log.debug("citing_url[" + citing_paper.url + "]")
            self.log.debug("citing_title[" + citing_paper.title + "]")
            self.log.debug("citing_authors[" + citing_paper.authors + "]")
            self.log.debug(citing_paper.get_vars())
            citing_paper.renewal_insert()
            citing_papers.append(citing_paper)
            citing_urls.append(citing_paper.url)
        return citings_str, citing_papers, citing_urls

    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import table_papers
        citeds_str = ""
        cited_papers = []
        cited_urls = []
        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        #self.save_current_page(driver, "./samples/sample_page_1055638_start.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_start.png")
        try:
            div = driver.find_element_by_css_selector(
                'div > section[class="document-all-references ng-scope"] > '
                'div[class="ng-scope"] > div[class="strong"]').text
            if div == "Citations not available for this document.":
                self.log.debug("this paper not cited. return []")
                return citeds_str, cited_papers, cited_urls
            self.log.debug("div=" + div + ", this paper is cited")
        except NoSuchElementException:
            self.log.debug("this paper is cited")
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_element_by_xpath('//b[@class=\"ng-binding\"]')) start")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//b[@class="ng-binding"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the first cited page.")
            self.move_to_paper_initial_page(driver, initial_url)
            return citeds_str, cited_papers, cited_urls
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the first cited page.")
            self.move_to_paper_initial_page(driver, initial_url)
            return citeds_str, cited_papers, cited_urls
        self.log.debug("Wait Finished.")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.png")
        self.log.debug("continue pushing more view button")
        elements = self.continuous_pushing_more_view_button(driver, timeout)
        self.log.debug("create arrays of paper and url")
        for el in elements:
            cited_url = (self.conf.getconf("IEEE_website") +
                         el.find_element_by_css_selector(
                             'div[class="ref-links-container '
                             'stats-citations-links-container"] > span > a'
                         ).get_attribute("ng-href"))
            cited_urls.append(cited_url)
            cited_authors, cited_title, cited_conference, cited_date = \
                self.parse_citing(
                    el.find_element_by_css_selector(
                        'div[ng-bind-html="::item.displayText"]').text)
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            cited_paper = table_papers.Table_papers(
                title=cited_title, authors=cited_authors,
                conference=cited_conference, published=cited_date,
                url=cited_url, timestamp=timestamp)
            self.log.debug(cited_paper.get_vars())
            cited_paper.renewal_insert()
        #self.save_current_page(driver, "./samples/sample_page_cited_view_more.html")
        #self.save_current_page(driver, "./samples/sample_page_cited_view_more.png")
        self.move_to_paper_initial_page(driver, initial_url)
        #self.save_current_page(driver, "./samples/sample_page_initial.html")
        #self.save_current_page(driver, "./samples/sample_page_initial.png")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return citeds_str, cited_papers, cited_urls

    def continuous_pushing_more_view_button(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        ##if not cited, the load-more-button does not exist.
        ##but if cited, the load-more-button always exists even when there
        ##are no more papers; the extra buttons are hidden.
        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > '
            'div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]')
        num_of_viewing = len(elements)
        limit_of_view = self.conf.getconf("IEEE_citation_num_at_first_page")
        self.log.debug("num_of_viewing[" + str(num_of_viewing) +
                       "], limit_of_view[" + str(limit_of_view) + "]")
        while num_of_viewing > limit_of_view - 10:
            limit_of_view += self.conf.getconf(
                "IEEE_citation_num_per_more_view")
            try:
                load_more_button = driver.find_element_by_xpath(
                    '//button[@class="load-more-button" and '
                    '@ng-click="vm.loadMoreCitations(\'ieee\')"]')
                load_more_button.click()
                WebDriverWait(driver, timeout).until(
                    lambda driver: driver.find_element_by_xpath(
                        '//button[@class="load-more-button" and '
                        '@ng-click="vm.loadMoreCitations(\'ieee\')" and '
                        '@aria-disabled="false"]'))
            except TimeoutException:
                m = ("caught TimeoutException at loading more cited pages(" +
                     str(limit_of_view) + ") paper[" + driver.current_url +
                     "].")
                print(m)
                self.log.warning(m)
            except NoSuchElementException:
                m = ("caught NoSuchElementException at loading more cited "
                     "pages(" + str(limit_of_view) + ") paper[" +
                     driver.current_url + "].")
                print(m)
                self.log.warning(m)
            except ElementNotVisibleException:
                m = ("caught ElementNotVisibleException at loading more "
                     "cited pages(" + str(limit_of_view) + ") paper[" +
                     driver.current_url + "]. break.")
                self.log.debug(m)
                break
            # re-query with the same selector as above (the original used a
            # truncated selector here, which matched nothing)
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > '
                'div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]')
            num_of_viewing = len(elements)
            self.log.debug("num_of_viewing[" + str(num_of_viewing) +
                           "], limit_of_view[" + str(limit_of_view) + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return elements

    def get_conference(self, driver):
        try:
            return driver.find_element_by_xpath(
                '//div[@class="u-pb-1 stats-document-abstract-doi ng-scope"]'
            ).find_element_by_tag_name('a').text
        except NoSuchElementException:
            return ""

    def get_date_of_publication(self, driver):
        # "Date of Publication: 06 January 2016" or
        # "Date of Conference: 14-16 Nov. 2006"
        try:
            date = driver.find_element_by_xpath(
                '//div[@ng-if="::vm.details.isJournal == true"]').text
            return self.convert_date_of_publication_to_datetime(date)
        except NoSuchElementException:
            try:
                date = driver.find_element_by_xpath(
                    '//div[@ng-if="::vm.details.isConference == true"]').text
                return self.convert_date_of_publication_to_datetime(date)
            except NoSuchElementException:
                self.log.debug("caught NoSuchElementException. date = None"
                               )  ##todo get from paper??
                driver.save_current_page(
                    "./samples/caughtNoSuchElementExceptionatdate_of_publication.png")
                driver.save_current_page(
                    "./samples/caughtNoSuchElementExceptionatdate_of_publication.html")
                return None

    def move_to_paper_initial_page(self, driver, initial_url, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        driver.get(initial_url)
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_element_by_xpath('//div[@ng-repeat=\"article in "
            "vm.contextData.similar\"]'))")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//div[@ng-repeat="article in vm.contextData.similar"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the paper top page.")
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the paper top page.")
        self.log.debug("Wait Finished.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def wait_button_to_pdf_page(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("Wait start.")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_css_selector(
                    'i[class="icon doc-act-icon-pdf"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at waiting for the button which "
                "goes to the pdf page.")
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at waiting for the button "
                "which goes to the pdf page.")
        self.log.debug("Wait Finished.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def download_a_paper(self, driver, path="../../data/tmp/",
                         filename="default", timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        initial_url = driver.current_url
        m = "downloading paper to " + path + ". title[" + filename + "]"
        self.log.info(m)
        print(m)
        self.wait_button_to_pdf_page(driver, timeout)
        button = driver.find_element_by_css_selector(
            'i[class="icon doc-act-icon-pdf"]')
        retries = 10
        while retries > 0:
            try:
                button.click()
                self.log.debug("clicked button and no exception. break")
                break
            except (RemoteDisconnected, ConnectionRefusedError,
                    URLError) as e:
                self.log.warning("caught " + e.__class__.__name__ +
                                 " at click download pdf button. retries[" +
                                 str(retries) + "]")
                self.log.warning(e, exc_info=True)
                time.sleep(
                    self.conf.getconf("IEEE_wait_time_per_download_paper"))
                driver.reconnect(initial_url)
                self.wait_button_to_pdf_page(driver, timeout)
                button = driver.find_element_by_css_selector(
                    'i[class="icon doc-act-icon-pdf"]')
                retries -= 1
            except NoSuchElementException:
                self.log.warning(
                    "caught NoSuchElementException at click download pdf "
                    "button. retries[" + str(retries) + "]")
                self.save_current_page(
                    driver,
                    "./samples/caught_NoSuchElementException_at_click_download_pdf_button.html")
                self.save_current_page(
                    driver,
                    "./samples/caught_NoSuchElementException_at_click_download_pdf_button.png")
                retries -= 1
        if retries == 0:
            self.log.error("button.click() error")
            self.save_current_page(driver,
                                   "./samples/button_click_error.html")
            self.save_current_page(driver,
                                   "./samples/button_click_error.png")
        self.log.debug("Wait start.")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//frameset[@rows="65,35%"]/frame'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the IEEE pdf page.")
            self.log.warning("skip to download pdf. return \"\"")
            driver.get(initial_url)
            return ""
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the IEEE pdf page.")
            self.log.warning("skip to download pdf. return \"\"")
            driver.get(initial_url)
            return ""
        self.log.debug("Wait Finished.")
        url = driver.find_elements_by_xpath(
            '//frameset[@rows="65,35%"]/frame')[1].get_attribute("src")
        self.log.debug("url:" + url)
        if filename == "default":
            filename = url[:url.index("?")].split("/")[-1]
        filename = filename.replace(":", "")
        self.log.debug("filename:" + filename)
        command = ("wget -p \"" + url + "\" -O \"" + path + filename +
                   "\" > /dev/null 2>&1")
        #command = "wget -p \"" + url + "\" -O \"" + path + filename + "\" 1> /dev/null 2>&1"
        #command = "wget -p \"" + url + "\" -O \"" + path + filename + "\""
        self.log.debug(command)
        try:
            self.log.debug(os.system(command))
        except Exception:
            self.log.warning("error at " + command)
        #self.save_current_page(driver, "./samples/7898372.png")
        #self.save_current_page(driver, "./samples/7898372.html")
        driver.get(initial_url)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        self.log.debug("return[" + path + filename + "]")
        return path + filename

    def convert_date_of_publication_to_datetime(self, string):
        ##from "Date of Publication: 06 January 2016"    to 2016-01-06
        ##from "Date of Conference: 14-16 Nov. 2006"     to 2006-11-14
        ##from "Date of Conference: 27 June-2 July 2016" to 2016-06-27
        ##from "Date of Publication: N/A 2016"           to 2016-01-01
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("string: " + string)
        date = ""
        month = ""
        year = ""
        string = string.replace("\n", "")
        tmp = string.split(":")
        if len(tmp) != 2:
            self.log.warning("len(tmp) != 2")
            self.log.warning("string:" + string)
            return None
        date_month_year = tmp[1].lstrip()
        self.log.debug("date_month_year[" + date_month_year + "]")
        tmp2 = date_month_year.split("-")
        if len(tmp2) >= 3:
            self.log.warning("len(tmp2) >= 3")
            self.log.warning("string:" + string)
            return None
        elif len(tmp2) == 2:
            if re.match(r"^\d{1,2}$", tmp2[0]):
                date = tmp2[0]
            elif re.match(r"^\d{1,2}\s[a-zA-Z]", tmp2[0]):
                tmp3 = tmp2[0].split(" ")
                date = tmp3[0]
                month = tmp3[1].replace(".", "")
        tmp4 = date_month_year.split(" ")
        if len(tmp4) < 3:
            self.log.debug("only year")
            self.log.debug("string:" + string)
            date = "1"
            month = "Jan"
        if date == "":
            date = tmp4[-3]
        if month == "":
            month = tmp4[-2].replace(".", "")
        if year == "":
            year = tmp4[-1]
        import datetime
        try:
            month = str(datetime.datetime.strptime(month, '%B').month)
        except ValueError:
            try:
                month = str(datetime.datetime.strptime(month, '%b').month)
            except ValueError:
                if month == "Sept":
                    month = "9"
                else:
                    self.log.warning("ValueError")
                    self.log.warning("string:" + string)
                    self.log.warning("month = 0")
                    month = "0"
        self.log.debug("year[" + year + "], month[" + month + "], date[" +
                       date + "]")
        timestamp = datetime.date(int(year), int(month), int(date))
        return timestamp

    def convert_paper_url_to_cited_url(self, url):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        #from http://ieeexplore.ieee.org/document/4116687/?reload=true
        #to   http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        self.log.debug("url[" + url + "]")
        converted_url = (url.split("?")[0] +
                         "citations?anchor=anchor-paper-citations-ieee&ctx=citations")
        self.log.debug("converted_url[" + converted_url + "]")
        return converted_url

    def convert_paper_url_to_pdf_url(self, url):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        ##from http://ieeexplore.ieee.org/document/6324382/
        ##to   http://ieeexplore.ieee.org/ielx7/35/7901458/07901477.pdf?tp=&arnumber=7901477&isnumber=7901458
        print("url[" + url + "]")

    def parse_citing(self, strings):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("src_str[" + strings + "]")
        #from
        #Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the Large-Scale
        #Web Application Community", Advanced Information Networking and
        #Applications Workshops (WAINA) 2013 27th International Conference
        #on, pp. 185-190, 2013.
        #to
        #Daniel Garant, Wei Lu,
        #Mining Botnet Behaviors on the Large-Scale Web Application Community
        #Advanced Information Networking and Applications Workshops (WAINA)
        #2013 27th International Conference on
        #pp. 185-190, 2013
        array = strings.split("\"")
        if len(array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(array)(" + str(len(array)) +
                             ") < 3. return \"\", \"\", \"\", \"\"")
            return "", "", "", ""
        authors = array[0]
        title = array[1]
        new_array = array[2][1:].split(",")
        self.log.debug("new_array:" + str(new_array))
        self.log.debug(len(new_array))
        if len(new_array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") < 3. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        elif len(new_array) == 3:
            conference, page, year = new_array
        elif len(new_array) == 4:
            conference, vol, page, year = new_array
        elif len(new_array) == 5:
            conference, vol, page, year, issn = new_array
        else:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") > 5. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        #self.log.debug("re.match(\"\d*\", " + year + ")")
        #year = re.match("*\d*", year).group() + "-01-01 00:00:00"
        #year += "-01-01 00:00:00"
        self.log.debug("citing year is none")
        year = None
        self.log.debug("authors[" + str(authors) + "], title[" +
                       str(title) + "], conference[" + str(conference) +
                       "], year[" + str(year) + "]")
        return authors, title, conference, year

    def reconnect_driver(self, driver, url):
        self.log.debug("driver reconnect")
        import signal
        driver.service.process.send_signal(
            signal.SIGTERM)  # kill the specific phantomjs child proc
        driver.quit()  # quit the node proc
        driver = self.create_driver(url)
        return driver

    ## for debug
    def print_h2_attributes(self, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        links = driver.find_elements_by_tag_name("h2")
        for link in links:
            print(link.text)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def save_current_page(self, driver, filename):
        self.log.warning(__class__.__name__ + "." +
                         sys._getframe().f_code.co_name + " start")
        self.log.warning("this method will be removed.")
        self.log.warning("please use driver.save_current_page(filename)")
        path, suffix = os.path.splitext(filename)
        self.log.debug("path[" + path + "], suffix[" + suffix + "]")
        if suffix == ".html":
            with open(filename, 'w') as f:
                f.write(driver.page_source)
        elif suffix == ".png":
            driver.save_screenshot(filename)
        else:
            self.log.error("TYPEERROR suffix[" + suffix + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def show_options(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.opt.show_options()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
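# A self-contained sketch of what convert_date_of_publication_to_datetime
# above produces for the simplest documented input form (stdlib only; the
# full method also copes with date ranges, abbreviated months, and missing
# fields):
import datetime

def parse_publication_date(text):
    # handles only the "Date of Publication: 06 January 2016" form
    day, month_name, year = text.split(":")[1].split()
    month = datetime.datetime.strptime(month_name, "%B").month
    return datetime.date(int(year), month, int(day))

assert parse_publication_date(
    "Date of Publication: 06 January 2016") == datetime.date(2016, 1, 6)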
class Table_papers(Base):
    __tablename__ = 'papers'
    id = Column("id", INTEGER, primary_key=True)
    title = Column("title", TEXT)
    authors = Column("authors", TEXT)
    keywords = Column("keywords", TEXT)
    citings = Column("citings", MEDIUMTEXT)
    citeds = Column("citeds", MEDIUMTEXT)
    conference = Column("conference", TINYTEXT)
    published = Column("published", DATE)
    url = Column("url", TINYTEXT)
    abstract_path = Column("abstract_path", TEXT)
    pdf_path = Column("pdf_path", TEXT)
    timestamp = Column("timestamp", DATETIME)
    label = Column("label", TINYTEXT)
    color = Column("color", TINYTEXT)

    def __init__(self, id="", title="", authors="", keywords="", citings="",
                 citeds="", conference="", published="", url="",
                 timestamp="", abstract_path="", pdf_path="", label="",
                 color=""):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import mysql_operator
        self.db = mysql_operator.Mysql_operator()
        self.id = id
        self.title = title
        self.authors = authors
        self.keywords = keywords
        self.citings = citings
        self.citeds = citeds
        self.conference = conference
        if published == "":
            self.published = None
        else:
            self.published = published
        self.url = url
        if timestamp == "":
            self.timestamp = None
        else:
            self.timestamp = timestamp
        self.abstract_path = abstract_path
        self.pdf_path = pdf_path
        self.label = label
        self.color = color

    def __repr__(self):
        return 'Table_papers'

    def insert(self):
        if self.id == "":
            self.id = self.get_id()
        vars_to_encode = [
            "title", "authors", "keywords", "abstract_path", "pdf_path"
        ]
        for var in vars_to_encode:
            if eval("self." + var) is not None:
                exec("self." + var + " = self." + var +
                     ".encode('utf-8', 'replace')")
        self.db.insert(self)
        for var in vars_to_encode:
            if eval("self." + var) is not None:
                exec("self." + var + " = self." + var +
                     ".decode('utf-8', 'replace')")
        self.db.session.expunge(self)
        self.close()

    def has_already_downloaded(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("paper.title[" + self.title + "]")
        if self.conf.getconf("IEEE_paper_download_period") <= 0:
            self.log.debug("IEEE_paper_download_period <= 0, return False")
            return False
        records = self.db.session.query(__class__).filter(
            __class__.title == self.title.encode('utf-8', 'replace')).all()
        if len(records) == 0:
            self.log.debug("This paper doesn't exist in db. return False")
            return False
        elif len(records) >= 2:
            self.log.warning("need to merge records")
            self.log.warning("title[" + self.title + "], len(records)[" +
                             str(len(records)) + "]")
        self.log.debug("This paper exists in db. Number of records is [" +
                       str(len(records)) + "]")
        if records[0].abstract_path == "":
            self.log.debug("but the abstract not downloaded. return False")
            return False
        self.log.debug(
            "and the abstract already downloaded. compare timestamps")
        limit = datetime.datetime.now() - timedelta(
            days=self.conf.getconf("IEEE_paper_download_period"))
        self.log.debug("limit[" + str(limit) + "], records[" +
                       str(records[0].timestamp) + "]")
        if limit > records[0].timestamp:
            self.log.debug("should renew db. return False")
            return False
        else:
            self.log.debug(
                "recently downloaded. clone paper and return True")
            clone_vars = [
                "authors", "keywords", "citings", "citeds", "conference",
                "published", "url", "timestamp", "abstract_path",
                "pdf_path", "label", "color"
            ]
            for var in clone_vars:
                exec("self." + var + " = records[0]." + var)
            self.close()
            return True
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def renewal_insert(self):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        # check duplication and insert
        records = self.db.session.query(__class__).filter(
            __class__.title == self.title.encode('utf-8')).all()
        if len(records) == 0:
            # new record
            self.insert()
            return 0
        merge_id_list = []
        for record in records:
            merge_id_list.append(record.id)
        vars = [
            "authors", "keywords", "citings", "citeds", "conference",
            "published", "url", "abstract_path", "pdf_path", "label",
            "color"
        ]
        for var in vars:
            for record in records:
                self.log.debug("record.id[" + str(record.id) + "]")
                self.log.debug("var[" + var + "], self[" +
                               str(eval("self." + var)) + "], record[" +
                               str(eval("record." + var)) + "]")
                tmp_timestamp = self.timestamp
                if (eval("record." + var) is None or
                        eval("record." + var) == ""):
                    self.log.debug("record." + var + " == None")
                elif (eval("self." + var) is None or
                        eval("self." + var) == ""):
                    self.log.debug("self." + var + " == None")
                    #tmp = eval("self." + var)
                    #tmp = eval("record." + var)
                    exec("self." + var + " = record." + var)
                    self.log.debug("->var[" + var + "], self[" +
                                   str(eval("self." + var)) + "], record[" +
                                   str(eval("record." + var)) + "]")
                    tmp_timestamp = record.timestamp
                else:
                    self.log.debug(var + " is not none. compare timestamps")
                    ## todo: check type(timestamp)
                    if (tmp_timestamp is None or
                            record.timestamp is None or
                            self.compare_timestamps(
                                old=tmp_timestamp, new=record.timestamp)):
                        ##if record.timestamp is newer
                        exec("self." + var + " = record." + var)
                        self.log.debug("->var[" + var + "], self[" +
                                       str(eval("self." + var)) +
                                       "], record[" +
                                       str(eval("record." + var)) + "]")
                        tmp_timestamp = record.timestamp
                #except:
                #    m = ("caught exception at tmp_timestamp[" +
                #         str(tmp_timestamp) + "] < record.timestamp[" +
                #         str(record.timestamp) + "]")
                #    self.log.warning(m)
                #    print(m)
        for record in records:
            self.db.delete(record)
        self.id = self.get_id()
        import time
        self.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        self.insert()
        ##merge citations
        self.log.debug("merge[" + str(merge_id_list) + "] to self.id[" +
                       str(self.id) + "]")
        for merge_id in merge_id_list:
            from table_citations import Table_citations
            from sqlalchemy import and_, or_
            merge_records = self.db.session.query(Table_citations).filter(
                or_(Table_citations.start == merge_id,
                    Table_citations.end == merge_id)).all()
            self.log.debug("id[" + str(merge_id) + "].records[" +
                           str(len(merge_records)) + "]")
            for merge_record in merge_records:
                self.merge_citations(merge_record, merge_id_list,
                                     survival_id=self.id,
                                     delete_id=merge_id)
        self.close()

    def merge_citations(self, merge_record, merge_id_list, survival_id,
                        delete_id):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("from[" + str(merge_record.start) + "]to[" +
                       str(merge_record.end) + "]")
        self.log.debug("survival_id[" + str(survival_id) + "], delete_id[" +
                       str(delete_id) + "]")
        from table_citations import Table_citations
        ##is delete_id start or end?
        if (merge_record.start in merge_id_list and
                merge_record.end in merge_id_list):
            self.log.debug("start[" + str(merge_record.start) +
                           "] and end[" + str(merge_record.end) +
                           "] are merge_id. delete.")
            self.log.debug("delete(merge_record)")
            self.db.delete(merge_record)
        #elif merge_record.start == delete_id and not merge_record.end in merge_id_list:
        elif merge_record.start == delete_id:
            self.log.debug("start[" + str(delete_id) +
                           "] is delete_id. end[" + str(merge_record.end) +
                           "]")
end[" + str(merge_record.end) + "]") self.log.debug("delete(merge_record)") self.db.delete(merge_record) citation = Table_citations(start=survival_id, end=merge_record.end) citation.renewal_insert() citation.close() #elif merge_record.end == delete_id and not merge_record.end in merge_id_list: elif merge_record.end == delete_id: self.log.debug("end[" + str(merge_record.end) + "] is delete_id. start[" + str(merge_record.start) + "]") citation = Table_citations(start=merge_record.start, end=survival_id) self.log.debug("delete(merge_record)") self.db.delete(merge_record) citation.renewal_insert() citation.close() def compare_timestamps(self, old, new): self.log.debug("compare old_timestamp[" + str(old) + "] < new[" + str(new) + "]?") old_str = str(old) new_str = str(new) if old_str < new_str: self.log.debug("return true") return True else: self.log.debug("return false") return False def get_citings_array(self): return self.citings.split(",") def get_citeds_array(self): return self.citeds.split(",") def get_id(self): self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " start") ##when the records which have same title exist, ##the id is smallest one of records. records = self.db.session.query(__class__).filter( __class__.title == self.title.encode('utf-8')).all() if len(records) == 0: #new record return self._get_available_id() id = records[0].id for record in records: if id > record.id: id = record.id return id def _get_available_id(self): self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " start") previous_id = 0 for q in self.db.session.query(__class__).order_by(__class__.id): if q.id - previous_id >= 2: self.log.debug("id[" + str(q.id) + "] - previous_id[" + str(previous_id) + "] > 2. return " + str(previous_id + 1)) return previous_id + 1 previous_id = q.id self.log.debug("for loop ended. return " + str(previous_id + 1)) return previous_id + 1 def close(self): self.db.session.close() self.db.close() def get_vars(self): return ("{" + "id: " + str(self.id) + ", " + "title: " + self.title + ", " + "authors: " + self.authors + ", " + "keywords: " + self.keywords + ", " + "citings: " + self.citings + ", " + "citeds: " + self.citeds + ", " + "conference: " + self.conference + ", " + "published: " + str(self.published) + ", " + "url: " + self.url + ", " + "timestamp: " + str(self.timestamp) + ", " + "abstract_path: " + self.abstract_path + ", " + "pdf_path: " + self.pdf_path + ", " + "label: " + self.label + ", " + "color: " + self.color + ", " + "}")
class IEEEXplore:
    def __init__(self):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()
        self.opt = Search_options()
        self.log.debug("class " + __class__.__name__ + " created.")

    def get_papers_of_new_conferences(self, conference_num):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + "(conference_num=" +
                      str(conference_num) + ") start.")

    def get_papers_by_keywords(self, keywords, num_of_papers):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        self.log.info("keywords[" + keywords + "], num_of_papers[" +
                      str(num_of_papers) + "]")
        driver = self.create_driver()
        self.search_by_keywords(driver, keywords)
        urls = self.get_urls_of_papers_in_keywords_page(driver,
                                                        num_of_papers)
        all_papers = []
        all_citing_urls = []
        all_cited_urls = []
        """
        for url in urls:
            driver.get(url)
            paper, citing_urls, cited_urls = \
                self.get_attributes_and_download_pdf(driver)
            all_papers.append(paper)
            all_citing_urls.append(citing_urls)
            all_cited_urls = (cited_urls)
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " finished")
        """
        return all_papers, all_cited_urls, all_citing_urls

    def get_papers_of_target_conference(self, conference_name):
        pass

    def create_driver(self, top_page=""):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        phantomjs_path = self.conf.getconf("phantomJS_pass")
        if top_page == "":
            top_page = self.conf.getconf("IEEE_top_page")
        from selenium import webdriver
        driver = webdriver.PhantomJS(phantomjs_path)
        self.log.debug("driver.get(" + top_page + ")")
        driver.get(top_page)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return driver")
        return driver

    def search_by_keywords(self, driver, keywords):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except Exception as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check logfile.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def set_options(self):
        pass

    def get_urls_of_papers_in_keywords_page(self, driver, num_of_papers):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        urls = []
        links = driver.find_elements_by_class_name("pure-u-22-24")
        self.log.debug("len(links)[" + str(len(links)) + "]")
        for link in links:
            element = link.find_element_by_css_selector("h2 > a")
            urls.append(element.get_attribute("href"))
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return " + str(urls))
        return urls

    def get_attributes_and_download_pdf(self, search, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/db")
        timeout = 30
        target_paper_url = search.node
        self.log.info("url[" + target_paper_url + "], times[" +
                      str(search.times) + "], limit[" + str(search.limit) +
                      "]")
        #if this paper is already downloaded, it is treated as visited and skipped.
        driver.get(target_paper_url)
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_element_by_xpath('//div[@ng-repeat=\"article in "
            "vm.contextData.similar\"]'))")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//div[@ng-repeat="article in vm.contextData.similar"]'))
        except TimeoutException:
            m = "caught TimeoutException at load the paper top page."
            print(m)
            self.log.warning(m)
        except NoSuchElementException:
            m = "caught NoSuchElementException at load the paper top page."
            print(m)
            self.log.warning(m)
        self.log.debug("Wait Finished.")
        import table_papers
        paper = table_papers.Table_papers()
        self.log.debug("get attributes of this paper")
        #paper.title = self.get_title(driver)
        #paper.authors = self.get_authors(driver)
        #paper.keywords = self.get_keywords(driver)
        #citing_urls = []
        paper.citings, citing_papers, citing_urls = self.get_citing_papers(
            driver, timeout)
        cited_urls = []
        #paper.citeds, cited_papers, cited_urls = self.get_cited_papers(driver, timeout)
        #paper.conference = self.get_conference(driver)
        #paper.published = self.get_date_of_publication(driver)
        #paper.url = target_paper_url
        import time
        paper.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        ##path
        #paper.renewal_insert()
        print(paper.get_vars())
        self.log.debug("insert citations of this paper to db")
        import table_citations
        for citing_paper in citing_papers:
            citation = table_citations.Table_citations(start=paper.id,
                                                       end=citing_paper.id)
            citation.renewal_insert()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return paper, citing_urls, cited_urls

    def get_title(self, driver):
        #element = driver.find_element_by_tag_name("title")
        #element = driver.find_element_by_id("title")
        #element = driver.find_element_by_css_selector("html > title")
        #element = driver.find_element_by_class_name("title")
        return driver.title

    def get_authors(self, driver):
        ##authors
        #<span ng-bind-html="::author.name" class="ng-binding">
        #elements = driver.find_elements_by_class_name("authors-container")
        #print(str(len(elements)))
        authors_str = ""
        elements = driver.find_elements_by_xpath(
            '//span[@ng-bind-html="::author.name"]')
        #print(str(len(elements))) #5
        for el in elements:
            authors_str += "," + el.text
        return authors_str[1:]

    def get_keywords(self, driver):
        ##keywords
        keywords_str = ""
        elements = driver.find_elements_by_xpath(
            '//a[@ng-bind-html="::term"]')
        #print(str(len(elements))) #21
        for el in elements:
            keyword = el.text
            if keyword in keywords_str:  ##todo internet concludes int
                self.log.debug("keyword[" + keyword +
                               "] is duplicated. not added.")
            else:
                keywords_str += "," + el.text
        return keywords_str

    def get_citing_papers(self, driver, timeout=30):
        ##citing_papers
        ##citing_urls
        """
        <a ng-href="/document/4116687" title="Using Machine Learning
           Techniques to Identify Botnet Traffic" target="_self"
           href="/document/4116687">
          <span ng-bind-html="::(vm.contextData.isStandard ?
                article.standardNumber + ' - ' + article.title :
                article.title) | charlimitHtml:185" mathjax-bind=""
                class="ng-binding">Using Machine Learning Techniques to
                Identify Botnet Traffic</span>
        </a>
        <div class="ng-binding">Carl Livadas; Robert Walsh; David Lapsley;
        W. Timothy Strayer</div>
        </div><!-- end ngRepeat: article in vm.contextData.similar -->
        <div class="doc-all-related-articles-list-item ng-scope"
             ng-repeat="article in vm.contextData.similar">
        """
        import table_papers
        citings_str = ""
        citing_papers = []
        citing_urls = []
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_elements_by_css_selector('div[ng-repeat=\"article "
            "in vm.contextData.similar\"] > a')) start")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_elements_by_css_selector(
                    'div[ng-repeat="article in vm.contextData.similar"] > a'))
        except TimeoutException:
            m = "caught TimeoutException at load the paper top page."
            print(m)
            self.log.warning(m)
            return citings_str, citing_papers, citing_urls
        except NoSuchElementException:
            m = "caught NoSuchElementException at load the paper top page."
            print(m)
            self.log.warning(m)
            return citings_str, citing_papers, citing_urls
        self.log.debug("Wait Finished.")
        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="article in vm.contextData.similar"]')
        print(str(len(elements)))
        self.save_current_page(driver,
                               "./samples/sample_page_4116687_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_4116687_start.png")
        print("create arrays of paper and url")
        for el in elements:
            citing_paper = table_papers.Table_papers()
            citing_paper.url = (self.conf.getconf("IEEE_website") +
                                el.find_element_by_css_selector(
                                    'a').get_attribute("ng-href"))
            citing_paper.title = el.find_element_by_css_selector(
                'a').get_attribute("title")
            citing_paper.authors = el.find_element_by_css_selector(
                'div[class="ng-binding"]').text.replace(";", ",")
            import time
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            print("citing_url[" + citing_paper.url + "]")
            print("citing_title[" + citing_paper.title + "]")
            print("citing_authors[" + citing_paper.authors + "]")
            print(citing_paper.get_vars())
            citing_paper.renewal_insert()
            citing_papers.append(citing_paper)
            citing_urls.append(citing_paper.url)
        return citings_str, citing_papers, citing_urls

    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import table_papers
        citeds_str = ""
        cited_papers = []
        cited_urls = []
        #href="/document/4116687/citations?tabFilter=papers"
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.png")
        """
        <div ng-if="!vm.loading && !vm.details.paperCitations.ieee &&
             !vm.details.paperCitations.nonIeee &&
             !vm.details.patentCitations" class="ng-scope" style="">
          Citations are not available for this document.
        </div>
        """
        #el = driver.find_element_by_xpath('//div[@ng-if="!vm.loading && !vm.details.paperCitations.ieee && !vm.details.paperCitations.nonIeee && !vm.details.patentCitations"')
        #els = driver.find_elements_by_xpath('//div[@class="ng-scope" and @style=""]') #ok. got els
    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import table_papers
        citeds_str = ""
        cited_papers = []
        cited_urls = []
        #href="/document/4116687/citations?tabFilter=papers"
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.png")
        """
        <div ng-if="!vm.loading && !vm.details.paperCitations.ieee && !vm.details.paperCitations.nonIeee && !vm.details.patentCitations" class="ng-scope" style="">
            Citations are not available for this document.
        </div>
        """
        #el = driver.find_element_by_xpath('//div[@ng-if="!vm.loading && !vm.details.paperCitations.ieee && !vm.details.paperCitations.nonIeee && !vm.details.patentCitations"')
        #els = driver.find_elements_by_xpath('//div[@class="ng-scope" and @style=""]')  #ok, got els
        #els = driver.find_elements_by_xpath('//div[@ng-if="::!vm.contextData.paperCitations.ieee && !vm.contextData.paperCitations.nonIeee && !vm.contextData.patentCitations"]')  #0
        #><div ng-if="::!vm.contextData.paperCitations.ieee && !vm.contextData.paperCitations.nonIeee && !vm.contextData.patentCitations" class="ng-scope">
        #els = driver.find_elements_by_xpath('//div[@ng-if="::!vm.contextData.paperCitations.ieee"]')  #0
        try:
            div = driver.find_element_by_css_selector(
                'div > section[class="document-all-references ng-scope"] > div[class="ng-scope"] > div[class="strong"]'
            ).text
            if div == "Citations not available for this document.":
                self.log.debug("this paper is not cited. return []")
                return citeds_str, cited_papers, cited_urls
            self.log.debug("div=" + div + ", this paper is cited")
        except NoSuchElementException:
            self.log.debug("this paper is cited")
        """
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except(Exception) as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check the logfile.")

        document-banner-metric ng-scope
        ui-sref="document.full({tab:'citations', q:null, ctx:null, section:null, part:null, anchor:null, tabFilter: 'papers'})"

        #self.save_current_page(driver, "./samples/sample_page2.html")
        self.save_current_page(driver, "./samples/sample_page2.png")

        <button class="load-more-button" type="button" ng-click="vm.loadMoreCitations('patent')" ng-disabled="vm.loading" tabindex="0" aria-disabled="false">
            <span ng-show="!vm.loading" aria-hidden="false" class="">View More</span>
            <i class="fa fa-spinner fa-spin ng-hide" ng-show="vm.loading" aria-hidden="true"></i>
        """
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: driver.find_element_by_xpath('//b[@class=ng-binding]' start"
        )
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//b[@class="ng-binding"]'))
        except TimeoutException:
            m = "caught TimeoutException while loading the first cited page."
            print(m)
            self.log.warning(m)
            driver.get(initial_url)
            return citeds_str, cited_papers, cited_urls
        except NoSuchElementException:
            m = "caught NoSuchElementException while loading the first cited page."
            print(m)
            self.log.warning(m)
            driver.get(initial_url)
            return citeds_str, cited_papers, cited_urls
        self.log.debug("Wait finished.")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.png")
        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]'
        )
        num_of_viewing = len(elements)
        limit_of_view = self.conf.getconf("IEEE_citation_num_at_first_page")
        print("num_of_viewing[" + str(num_of_viewing) + "], limit_of_view[" +
              str(limit_of_view) + "]")
        print("continue pushing the View More button")
        ##if the paper is not cited, the load-more-button does not exist;
        ##if it is cited, the load-more-button always exists even when there
        ##are no more papers, and the extra buttons are hidden.
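        ## The loop below keeps clicking "View More" while the page appears to
        ## have loaded a full batch: once fewer than (limit_of_view - 10) items
        ## are shown, it assumes no more citations remain. The slack of 10 and
        ## the batch sizes come from conf (IEEE_citation_num_at_first_page /
        ## IEEE_citation_num_per_more_view); this reading of the condition is
        ## an interpretation of the original logic, not documented behavior.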
        while num_of_viewing > limit_of_view - 10:
            limit_of_view += self.conf.getconf("IEEE_citation_num_per_more_view")
            load_more_button = driver.find_element_by_xpath(
                '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')"]'
            )
            load_more_button.click()
            try:
                WebDriverWait(driver, timeout).until(
                    lambda driver: driver.find_element_by_xpath(
                        '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')" and @aria-disabled="false"]'
                    ))
            except TimeoutException:
                m = ("caught TimeoutException while loading more cited pages(" +
                     str(limit_of_view) + ") paper[" + driver.current_url + "].")
                print(m)
                self.log.warning(m)
            except NoSuchElementException:
                m = ("caught NoSuchElementException while loading more cited pages(" +
                     str(limit_of_view) + ") paper[" + driver.current_url + "].")
                print(m)
                self.log.warning(m)
            #elements = driver.find_elements_by_css_selector('div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div > div > b[class="ng-binding"]')
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]'
            )
            #self.save_current_page(driver, "./samples/sample_page_1055638_cited_" + str(limit_of_view) + ".html")
            #self.save_current_page(driver, "./samples/sample_page_1055638_cited_" + str(limit_of_view) + ".png")
            num_of_viewing = len(elements)
            print("num_of_viewing[" + str(num_of_viewing) +
                  "], limit_of_view[" + str(limit_of_view) + "]")
        print("cited loop finished. num_of_viewing[" + str(num_of_viewing) +
              "], limit_of_view[" + str(limit_of_view) + "]")
        print("create arrays of paper and url")
        for el in elements:
            cited_url = self.conf.getconf(
                "IEEE_website") + el.find_element_by_css_selector(
                    'div[class="ref-links-container stats-citations-links-container"] > span > a'
                ).get_attribute("ng-href")
            cited_urls.append(cited_url)
            cited_authors, cited_title, cited_conference, cited_date = self.parse_citing(
                el.find_element_by_css_selector(
                    'div[ng-bind-html="::item.displayText"]').text)
            import time
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            cited_paper = table_papers.Table_papers(title=cited_title,
                                                    authors=cited_authors,
                                                    conference=cited_conference,
                                                    published=cited_date,
                                                    url=cited_url,
                                                    timestamp=timestamp)
            print(cited_paper.get_vars())
            cited_paper.renewal_insert()
            # was missing: cited_papers is returned below but never filled
            cited_papers.append(cited_paper)
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited_view_more.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited_view_more.png")
        driver.get(initial_url)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return citeds_str, cited_papers, cited_urls

    def get_conference(self, driver):
        conference = driver.find_element_by_xpath(
            '//div[@class="u-pb-1 stats-document-abstract-doi ng-scope"]'
        ).find_element_by_tag_name('a').text
        return conference

    def get_date_of_publication(self, driver):
        #Date of Publication: 06 January 200 or Date of Conference 14-16 Nov. 2006
        try:
            date = driver.find_element_by_xpath(
                '//div[@ng-if="::vm.details.isConference == true"]').text
        except NoSuchElementException:
            self.log.debug("caught NoSuchElementException. date = ''")  ##todo paper
            date = ""
        return self.convert_to_datetime(date)
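    ## convert_to_datetime (further below) is flagged as incomplete. This is a
    ## minimal parsing sketch for the two formats mentioned in the comment in
    ## get_date_of_publication ("06 January ..." publication dates and
    ## "14-16 Nov. 2006" conference ranges); for a range it keeps only the end
    ## date. The formats are assumptions based on that comment, not on live
    ## page data, and this helper is not wired into the scraper.
    def parse_publication_date(self, text):
        import datetime
        text = text.split(":")[-1].strip()  # drop a "Date of ...:" prefix if present
        parts = text.split()
        if parts and "-" in parts[0]:  # "14-16 Nov. 2006" -> "16 Nov. 2006"
            text = parts[0].split("-")[-1] + " " + " ".join(parts[1:])
        for fmt in ("%d %B %Y", "%d %b. %Y", "%d %b %Y"):
            try:
                return datetime.datetime.strptime(text, fmt)
            except ValueError:
                pass
        return None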
    def download_a_paper(self, driver, path="../../data/tmp/"):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        initial_url = driver.current_url
        button = driver.find_element_by_css_selector(
            'i[class="icon doc-act-icon-pdf"]')  # closing quote was missing
        button.click()
        self.save_current_page(driver,
                               "./samples/sample_page_7849067_pdf_click.html")
        self.save_current_page(driver,
                               "./samples/sample_page_7849067_pdf_click.png")
        driver.get(initial_url)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def download_papers_by_keywords(self, driver, path, download_num=25):
        # 0: desktop, 1: system default download folder, 2: user-defined folder
        #driver.setPreference("browser.download.folderList", 2)
        # since 2 is selected above, specify the download directory
        #driver.setPreference("browser.download.dir", path)
        # boolean: whether to show the download manager window when a download starts
        #driver.setPreference("browser.download.manager.showWhenStarting", False)
        links = driver.find_elements_by_class_name("pure-u-22-24")
        self.log.debug("len(links)[" + str(len(links)) + "]")
        i = 0
        for link in links:
            self.log.debug("txt:" + link.text)
            element = link.find_element_by_css_selector("h2 > a")
            pdf_title = element.text
            self.log.debug("pdf_title:" + pdf_title)
            pdf_url = self.convert_path_to_url(element.get_attribute("href"))
            self.log.debug("pdf_dir:" + pdf_url)
            pdf_authors = link.find_element_by_css_selector("p").text.split("; ")
            self.log.debug("pdf_author:" + str(pdf_authors))
            print("pdf_title:" + pdf_title)
            print("pdf_dir:" + pdf_url)
            print("pdf_author:" + str(pdf_authors))
            i += 1
            if i >= download_num:
                self.log.debug("i >= " + str(download_num) + ". " +
                               __class__.__name__ + "." +
                               sys._getframe().f_code.co_name + " finished.")
                return 0
        self.log.debug("len(links) < " + str(download_num) + ". " +
                       __class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished.")
        return 0

    """
    def get_papers_with_breadth_first_search(self, root_url_of_paper):
        import math
        math.breadth_first_search(root_url_of_paper, get_citing_papers())
        self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " start")
        self.log.debug("root_url_of_paper[" + root_url_of_paper + "]")
        citing_urls, cited_urls = ***
        for url in citing_urls:
            self.get_papers_with_breadth_first_search(url)
        self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " finished")
    """

    def convert_to_datetime(self, src):
        # parameter renamed from `str` to avoid shadowing the builtin
        self.log.warning("!!!incomplete method[" + __class__.__name__ + "." +
                         sys._getframe().f_code.co_name + "]!!!")
        import time
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        return timestamp

    def convert_paper_url_to_cited_url(self, url):
        #from
        #http://ieeexplore.ieee.org/document/4116687/?reload=true
        #to
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        #NOTE: assumes the path part ends with a trailing slash, as above.
        return url.split("?")[
            0] + "citations?anchor=anchor-paper-citations-ieee&ctx=citations"
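    ## A minimal self-check sketch for parse_citing (below), using the example
    ## string from its own comments; meant to be run by hand, not part of the
    ## scraper flow.
    def test_parse_citing(self):
        src = ('Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the '
               'Large-Scale Web Application Community", Advanced Information '
               'Networking and Applications Workshops (WAINA) 2013 27th '
               'International Conference on, pp. 185-190, 2013.')
        authors, title, conference, year = self.parse_citing(src)
        print("authors[" + authors + "]")
        print("title[" + title + "]")
        print("conference[" + conference + "]")
        print("year[" + year + "]")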
    def parse_citing(self, src):
        # parameter renamed from `str`: shadowing the builtin broke the
        # str(len(...)) calls below
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("src_str[" + src + "]")
        #from
        #Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the Large-Scale Web Application Community", Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on, pp. 185-190, 2013.
        #to
        #Daniel Garant, Wei Lu,
        #Mining Botnet Behaviors on the Large-Scale Web Application Community
        #Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on
        #pp. 185-190, 2013
        array = src.split("\"")
        if len(array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("src[" + src + "]")
            self.log.warning("len(array)(" + str(len(array)) +
                             ") < 3. return \"\", \"\", \"\", \"\"")
            return "", "", "", ""
        authors = array[0]
        title = array[1]
        new_array = array[2][1:].split(",")
        print(len(new_array))
        if len(new_array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("src[" + src + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") < 3. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        elif len(new_array) == 3:
            conference, page, year = new_array
        elif len(new_array) == 4:
            conference, vol, page, year = new_array
        elif len(new_array) == 5:
            conference, vol, page, year, issn = new_array
        else:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("src[" + src + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") > 5. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        return authors, title, conference, year

    ## for debug
    def print_h2_attributes(self, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        links = driver.find_elements_by_tag_name("h2")
        for link in links:
            print(link.text)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def save_current_page(self, driver, filename):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        path, suffix = os.path.splitext(filename)
        self.log.debug("path[" + path + "], suffix[" + suffix + "]")
        if suffix == ".html":
            with open(filename, 'w') as f:
                f.write(driver.page_source)
        elif suffix == ".png":
            driver.save_screenshot(filename)
        else:
            self.log.error("TYPEERROR suffix[" + suffix + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def show_options(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.opt.show_options()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
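    ## A sketch of the wait idiom used throughout this class, expressed with
    ## Selenium's expected_conditions module instead of a bare lambda; assumes
    ## a Selenium version that ships selenium.webdriver.support. Not called by
    ## the methods above.
    def wait_for_css(self, driver, css, timeout=30):
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        try:
            # returns the first matching element once it is present in the DOM
            return WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, css)))
        except TimeoutException:
            self.log.warning("timed out waiting for css[" + css + "]")
            return None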
    def test_var(self):
        from conf import Conf
        print("IEEE_website[" + Conf.getconf("IEEE_website") + "]")
        print("IEEE_top_page[" + Conf.getconf("IEEE_top_page") + "]")
        paper_url = Conf.getconf("IEEE_website") + "/document/6550394"
        print("paper_url[" + paper_url + "]")
    def test_conf(self):
        from conf import Conf
        print("loglevel[" + Conf.getconf("loglevel") + "]")
import datetime
import os
import re
import sys

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
from conf import Conf
from log import Log

sys.path.append(
    os.path.dirname(os.path.abspath(__file__)) + "/../src/scraping")
from kakaku import Kakaku

log = Log.getLogger()
kakaku = Kakaku()
args = sys.argv[1:]
print("products: " + str(args))
log_dir = Conf.getconf("product_log_dir")
for arg in args:
    log.info("target product name[" + str(arg) + "]")
    product_name = arg
    log_name = str(datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + "_" +
                   re.sub(" |/", "_", product_name) + ".log")
    #log_name = "product.log"
    #product_log = Log.getLogger(logfile=log_dir + log_name)
    product_log = log
    print("product: " + product_name + ", save log to: " + log_dir + log_name)
    try:
        kakaku.save_cheapest_pdf(product_name, logger=product_log)
    except Exception as e:
        print("Failed. caught " + e.__class__.__name__ +
              " exception. Please retry [" + product_name + "]")
        print(e)
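## A minimal retry sketch: the loop above only reports a failure and moves on
## to the next product. If retries are wanted, save_cheapest_pdf could be
## wrapped like this (the attempt count is illustrative; this helper is not
## used above):
def retry_call(func, *args, attempts=3, **kwargs):
    """Call func up to `attempts` times, returning on the first success."""
    last = None
    for i in range(attempts):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            last = e
            print("attempt " + str(i + 1) + "/" + str(attempts) +
                  " failed: " + e.__class__.__name__)
    raise last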