def getSparkLogger(cls, logfile="", conffile=""):
    if logfile == "":
        logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
            "logfile", conffile=conffile)
    from pyspark import SparkContext
    #sc = SparkContext.getOrCreate()
    #log4j = sc._jvm.org.apache.log4j
    #logger = log4j.LogManager.getLogger("myLogger")
    #logger = log4j.LogManager.getLogger(__name__)
    #logger = log4j.LogManager.getRootLogger()
    #logger.appender.FILE.File = "../../var/log/log"
    #logger.setLevel(log4jLogger.Level.DEBUG)
    #logger.info("aaaa")
    #return logger
    if 'LOG_DIRS' not in os.environ:
        sys.stderr.write(
            'Missing LOG_DIRS environment variable, pyspark logging disabled')
        return
    file = os.environ['LOG_DIRS'].split(',')[0] + '/log'
    logging.basicConfig(
        filename=file,
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s')
    logger = logging.getLogger()
    return logger
def getLogger(cls, logfile="", conffile=""):
    from conf import Conf
    if logfile == "":
        logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
            "logfile", conffile=conffile)
    loglevel = Conf.getconf("loglevel", conffile=conffile)
    rotate_log_size = Conf.getconf("rotate_log_size")
    import logging, logging.handlers
    logger = logging.getLogger()
    if len(logger.handlers) < 1:
        #fh = logging.FileHandler(filename="../../var/log/log2")
        #logger.addHandler(fh)
        rfh = logging.handlers.RotatingFileHandler(
            filename=logfile,
            maxBytes=rotate_log_size,
            backupCount=Conf.getconf("backup_log_count"))
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        rfh.setFormatter(formatter)
        logger.addHandler(rfh)
        #logging.basicConfig(filename="../../var/log/log2")
    id_ = id(logger)
    # resolve the level name safely instead of eval()
    logger.setLevel(getattr(logging, loglevel))
    logger.debug(
        "return logger\n logfile[{logfile}]\n rotate_log_size[{rotate_log_size}]\n id[{id_}]"
        .format(**locals()))
    return logger
def __init__(self):
    self.log = Log.getLogger()
    self.driver = self.create_driver()
    self.top_url = Conf.getconf("kakaku_top_page")
    self.target_stores = Conf.getconf("target_stores")
    self.extract_store_name = re.compile(r"\'")
    self.warning_messages = False
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
def move_to_vendor_page(self, vendor_button):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start.")
    self.driver.get(vendor_button.get_attribute("href"),
                    warning_messages=self.warning_messages)
    self.log.debug("wait start")
    for sec in range(Conf.getconf("phantomJS_load_timeout")):
        self.log.debug("wait redirect " + str(sec) + "[sec]")
        if self.driver.title:
            self.log.debug("move to shop page finished. page title: " +
                           self.driver.title)
            break
        time.sleep(Conf.getconf("vendor_page_wait_time"))
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished.")
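# The wait loop above reduces to a small reusable pattern: poll a condition
# at a fixed interval until it holds or the attempt budget runs out. A
# minimal, self-contained sketch (condition is a caller-supplied callable,
# not part of the original source):
import time

def wait_until(condition, attempts=30, interval_sec=1.0):
    for _ in range(attempts):
        if condition():
            return True
        time.sleep(interval_sec)
    return False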
def save_current_page(self, filename):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
    path, suffix = os.path.splitext(filename)
    max_filename_length = Conf.getconf("max_filename_length")
    if len(path) > max_filename_length:
        self.log.debug("filename too long. convert from :" + filename)
        filename = path[:max_filename_length] + suffix
        self.log.debug("to :" + filename)
    self.log.debug("path[" + path + "], suffix[" + suffix + "]")
    if suffix == ".html":
        with open(filename, 'w') as f:
            f.write(self.page_source)
    elif suffix == ".png":
        self.save_screenshot(filename)
    elif suffix == ".pdf":
        pngname = os.path.splitext(filename)[0] + ".png"
        self.save_screenshot(pngname)
        self.convert_png_to_pdf(pngname)
    else:
        self.log.error(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name)
        self.log.error("TYPEERROR suffix[" + suffix + "]")
    self.log.debug("saved to " + filename)
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished")
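# A self-contained sketch of the filename-capping rule above: truncate the
# basename while preserving the suffix (the length limit here is a
# placeholder, not the project's conf value):
import os

def cap_filename(filename, max_filename_length=200):
    path, suffix = os.path.splitext(filename)
    if len(path) > max_filename_length:
        filename = path[:max_filename_length] + suffix
    return filename

# e.g. cap_filename("a" * 300 + ".png") keeps 200 "a"s plus ".png"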
def getLogger(cls, logfile="", conffile=""):
    exec_env = Conf.getconf("exec_env")
    if exec_env == "normal":
        return cls.getNormalLogger(logfile=logfile, conffile=conffile)
    elif exec_env == "spark":
        return cls.getSparkLogger(logfile=logfile, conffile=conffile)
    else:
        sys.exit("getLogger Type Error[" + str(exec_env) + "]")
def getNormalLogger(cls, logfile="", conffile=""):
    if logfile == "":
        logfile = Conf.getconf("logdir", conffile=conffile) + Conf.getconf(
            "logfile", conffile=conffile)
    logger = logging.getLogger(logfile)
    if len(logger.handlers) > 1:
        # called before and already created
        return logger
    loglevel = Conf.getconf("loglevel", conffile=conffile)
    rotate_log_size = Conf.getconf("rotate_log_size")
    if len(logger.handlers) < 1:
        rfh = logging.handlers.RotatingFileHandler(
            filename=logfile,
            maxBytes=rotate_log_size,
            backupCount=Conf.getconf("backup_log_count"))
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        rfh.setFormatter(formatter)
        logger.addHandler(rfh)
        if Conf.getconf("loglevel_to_stdout", conffile=conffile):
            stream_handler = logging.StreamHandler()
            stream_handler.setFormatter(formatter)
            stream_handler.setLevel(
                Conf.getconf("loglevel_to_stdout", conffile=conffile))
            logger.addHandler(stream_handler)
    id_ = id(logger)
    # resolve the level name safely instead of eval()
    logger.setLevel(getattr(logging, loglevel))
    logger.debug(
        "return normal logger\n logfile[{logfile}]\n rotate_log_size[{rotate_log_size}]\n id[{id_}]"
        .format(**locals()))
    return logger
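# A minimal, self-contained sketch of the handler setup above, using only the
# standard library (the filename, level, and sizes here are placeholders, not
# values from the project's conf file). Keying the logger on the logfile path,
# as getNormalLogger does, makes repeated calls return the same configured
# instance without stacking duplicate handlers:
import logging
import logging.handlers

def make_rotating_logger(logfile="./app.log", level="DEBUG",
                         max_bytes=1024 * 1024, backups=3):
    logger = logging.getLogger(logfile)  # one logger per logfile path
    if not logger.handlers:
        handler = logging.handlers.RotatingFileHandler(
            filename=logfile, maxBytes=max_bytes, backupCount=backups)
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(getattr(logging, level))
    return logger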
def establish_session(self):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
    import sqlalchemy
    from conf import Conf
    self.engine = sqlalchemy.create_engine(Conf.getconf("myslq_url"),
                                           echo=False)
    from sqlalchemy.orm import sessionmaker
    Session = sessionmaker(bind=self.engine)
    session = Session()
    session.expire_on_commit = False
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished")
    return session
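# Self-contained sketch of the same session setup (the sqlite URL is a
# placeholder; the project reads its MySQL URL from the "myslq_url" conf
# key). Passing expire_on_commit=False to sessionmaker is equivalent to
# setting it on the session afterwards, and keeps loaded attributes usable
# after a commit:
import sqlalchemy
from sqlalchemy.orm import sessionmaker

engine = sqlalchemy.create_engine("sqlite:///:memory:", echo=False)
Session = sessionmaker(bind=engine, expire_on_commit=False)
session = Session()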
def __init__(self, executable_path="",
             port=0, desired_capabilities=DesiredCapabilities.PHANTOMJS,
             service_args=None, service_log_path=None):
    self.executable_path = executable_path
    self.port = port
    self.PHANTOMJS = desired_capabilities
    self.service_args = service_args
    self.service_log_path = service_log_path
    self.log = Log.getLogger()
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start")
    import logging, logging.handlers
    selenium_logger = logging.getLogger(
        'selenium.webdriver.remote.remote_connection')
    selenium_logger.setLevel(logging.ERROR)
    if len(selenium_logger.handlers) < 1:
        rfh = logging.handlers.RotatingFileHandler(
            filename=Conf.getconf("logdir") +
            Conf.getconf("phantomjs_logfile"),
            maxBytes=Conf.getconf("rotate_log_size"),
            backupCount=Conf.getconf("backup_log_count"))
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        rfh.setFormatter(formatter)
        selenium_logger.addHandler(rfh)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        stream_handler.setLevel(Conf.getconf("loglevel_to_stdout"))
        selenium_logger.addHandler(stream_handler)
    if self.executable_path == "":
        self.executable_path = Conf.getconf("phantomJS_pass")
    if self.service_args is None:
        self.service_args = ["--webdriver-loglevel=DEBUG"]
    if self.service_log_path is None:
        self.service_log_path = Conf.getconf("logdir") + Conf.getconf(
            "phantomjs_logfile")
    self.log.debug(__class__.__name__ + ".super().__init__ start")
    super().__init__(executable_path=self.executable_path,
                     port=self.port,
                     desired_capabilities=self.PHANTOMJS,
                     service_args=self.service_args,
                     service_log_path=self.service_log_path)
    self.set_page_load_timeout(Conf.getconf("phantomJS_load_timeout"))
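# The quieting of selenium's wire-protocol chatter above is a general
# pattern: raise the level on a noisy library's named logger so only errors
# propagate to your handlers. A minimal sketch (the logger name is the real
# selenium one used above; everything else is illustrative):
import logging

noisy = logging.getLogger('selenium.webdriver.remote.remote_connection')
noisy.setLevel(logging.ERROR)  # DEBUG/INFO chatter is dropped at the source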
def save_cheapest_pdf(self, product_name, logger=None):
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " start.")
    self.log.debug("product_name[" + product_name + "]")
    if logger:
        self.log.debug("change logger")
        self.log = logger
        self.driver.log = logger
    print("move to kakaku.com")
    self.move_to_top_page()
    print("search product. name[" + product_name + "]")
    search_results = self.search_product(product_name)
    if len(search_results) > 1:
        self.log.warning("search results of product_name[" +
                         str(product_name) + "] = " +
                         str(len(search_results)) + " > 1.")
        self.log.warning("use only first result.")
    print("click top of search result")
    if not self.driver.click(search_results[0],
                             warning_messages=self.warning_messages):
        self.log.error("click failed. Please retry.")
        #exit(1)
        raise Exception("click failed")
    #tag = '//td[@class="fRed"]/p[@class="wordwrapTrs"]/a'
    tag = '//p[@class="wordwrapShop"]/a'
    self.driver.wait_appearance_of_tag(by="xpath", tag=tag)
    print("get cheapest vendor")
    cheapest_vendor, vendor_name = self.get_cheapest_vendor_button(
        product_name)
    print("move_to_vendor_page")
    self.move_to_vendor_page(cheapest_vendor)
    path = Conf.getconf("pdf_save_path")
    print("save as " + path + "/" + product_name + "|" + vendor_name +
          ".pdf")
    self.driver.save_current_page(path + "/" + product_name + "|" +
                                  vendor_name + ".pdf")
    self.log.debug(__class__.__name__ + "." +
                   sys._getframe().f_code.co_name + " finished.")
def get(self, url, tag_to_wait="", by="xpath", timeout="default",
        warning_messages=True):
    if timeout == "default":
        timeout = self.load_timeout
    retries = 10
    while retries > 0:
        try:
            self.log.debug("super().get(" + url + ") start")
            super().get(url)
            break
        except RemoteDisconnected as e:
            self.log.debug("PhantomJS caught RemoteDisconnected at get " +
                           url)
            self.log.debug("%s", e)
            self.log.debug("retries[" + str(retries) + "]")
            # re-create the underlying driver session, then retry
            # (the original passed a logger= kwarg here, which __init__
            # does not accept and would raise TypeError)
            super().__init__(executable_path=self.executable_path,
                             port=self.port,
                             desired_capabilities=self.PHANTOMJS,
                             service_args=self.service_args,
                             service_log_path=self.service_log_path)
            retries -= 1
        except TimeoutException as e:
            self.save_error_messages_at(sys._getframe().f_code.co_name,
                                        "by[" + by + "], tag[" +
                                        tag_to_wait + "]",
                                        warning_messages, e, url=url)
            # stop loading and proceed with the partially rendered page
            self.execute_script("window.stop();")
            break
    if retries == 0:
        self.log.error("PhantomJS caught ERROR RemoteDisconnected at get " +
                       url)
        self.save_current_page("../../var/ss/get_error.html")
        self.save_current_page("../../var/ss/get_error.png")
    wait_time = Conf.getconf("phantomJS_wait_time_per_get")
    self.log.debug("get finished. wait " + str(wait_time) + " seconds")
    time.sleep(wait_time)
    if tag_to_wait != "":
        self.wait_appearance_of_tag(by=by, tag=tag_to_wait, timeout=timeout)
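# The retry pattern above, reduced to a self-contained sketch: attempt a
# flaky operation a fixed number of times, rebuilding the broken session on a
# connection-level failure before retrying (fetch and reconnect are
# caller-supplied placeholders, not part of the original source):
import time

def get_with_retries(fetch, reconnect, retries=10, wait=1.0):
    while retries > 0:
        try:
            return fetch()
        except ConnectionError:
            reconnect()  # rebuild the broken session, then retry
            retries -= 1
            time.sleep(wait)
    raise RuntimeError("all retries exhausted")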
class IEEEXplore:
    def __init__(self):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()
        self.opt = Search_options()
        self.log.debug("class " + __class__.__name__ + " created.")

    def get_papers_of_new_conferences(self, conference_num):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + "(conference_num=" +
                      str(conference_num) + ") start.")

    def get_papers_by_keywords(self, keywords, num_of_papers="all",
                               search_options="default",
                               path="../../data/tmp/", filename="title",
                               timeout=30):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        self.log.info("keywords[" + keywords + "], num_of_papers[" +
                      str(num_of_papers) + "]")
        driver = self.create_driver(timeout=timeout)
        if search_options == "default":
            search_options = Search_options()
        self.search_by_keywords(driver, keywords,
                                search_options=search_options,
                                timeout=timeout)
        if num_of_papers == "all":
            element = driver.find_element_by_css_selector(
                'div[class="pure-u-1-1 Dashboard-header ng-scope"] > span')
            num_of_papers = int(element.text.split(" ")[-1].replace(",", ""))
            self.log.debug("num_of_papers[" + str(num_of_papers) + "]")
        urls = self.get_urls_of_papers_in_keywords_page(
            driver, search_options.PerPage, num_of_papers, timeout)
        print("urls.size[" + str(len(urls)) + "]")
        all_papers = []
        all_citing_urls = []
        all_cited_urls = []
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/math")
        from searchs import Searchs
        search = Searchs(limit=num_of_papers)
        for url in urls:
            search.node = url
            paper, citing_urls, cited_urls = \
                self.get_attributes_and_download_pdf(
                    search, driver, path=path, filename=filename)
            print("paper.title[" + paper.title + "]")
            all_papers.append(paper)
            all_citing_urls.extend(citing_urls)
            all_cited_urls.extend(cited_urls)
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " finished")
        return all_papers, urls, all_citing_urls, all_cited_urls

    def get_papers_of_target_conference(self, conference_name):
        pass

    def create_driver(self, url="", timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("url[" + url + "]")
        if url == "" or url == self.conf.getconf("IEEE_top_page"):
            url = self.conf.getconf("IEEE_top_page")
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) +
            "/../../lib/scraping")
        from phantomjs_ import PhantomJS_
        driver = PhantomJS_(desired_capabilities={
            'phantomjs.page.settings.resourceTimeout': timeout
        })
        self.log.debug("driver.get(" + url + ")")
        driver.get(url, tag_to_wait='//li[@class="Media-articles-item"]',
                   by="xpath", timeout=timeout)
        self.log.debug("driver.get finished")
        """
        if url == self.conf.getconf("IEEE_top_page"):
            self.log.debug("Wait start.")
            try:
                WebDriverWait(driver, timeout).until(
                    lambda driver: driver.find_element_by_xpath(
                        '//li[@class="Media-articles-item"]'))
            except TimeoutException:
                self.log.warning(
                    "caught TimeoutException at load the IEEE top page.")
            except NoSuchElementException:
                self.log.warning(
                    "caught NoSuchElementException at load the IEEE top page.")
            self.log.debug("Wait Finished.")
        """
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return driver")
        return driver

    def wait_search_results(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("Wait start.")
        try:
            tag = ('//input[@type="checkbox" and '
                   '@data-group="search-results-group" and '
                   '@ng-checked="vm.allSelected()"]')
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(tag))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the keywords results page.")
            self.log.warning("at " + sys._getframe().f_code.co_name)
            self.log.warning("url[" + driver.current_url + "]")
            self.log.warning("tag[find_element_by_xpath(" + tag + ")]")
            filename = (
                "./samples/TimeoutExceptionatLoadtheKeywordsResultsPage." +
                re.sub(r"/|:|\?", "", driver.current_url))
            self.save_current_page(driver, filename + ".png")
            self.save_current_page(driver, filename + ".html")
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the keywords results page.")
        self.log.debug("Wait Finished.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return 0

    def search_by_keywords(self, driver, keywords, search_options="default",
                           timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        driver.wait_appearance_of_tag(by="name", tag='queryText',
                                      timeout=timeout)
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except Exception as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check logfile.")
        self.wait_search_results(driver, timeout)
        self.set_options(driver, search_options, timeout)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return 0

    def set_options(self, driver, search_options, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        try:
            #self.save_current_page(driver, "./samples/before_set_options.png")
            #self.save_current_page(driver, "./samples/before_set_options.html")
            ##show" : "All Resul
            ##PerPage" : "25"
            if search_options.PerPage != 25:
                element = driver.find_element_by_css_selector(
                    'div[ng-model="vm.rowsPerPage"] > div > select')
                #print(len(element))
                #print(element.text)
                Select(element).select_by_visible_text(
                    str(search_options.PerPage))
                self.wait_search_results(driver, timeout)
                #Select(element).select_by_value("object:75")
            ##SortBy" : "MostCit
            ##ContentType" : "No
            ##YearType" : "Range
            ##YearFrom" : "1996"
            ##YearTo" : "2017",
            ##Year" : "2017",
            ##Author" : "None",
            ##Affiliation" : "No
            ##PublicationTitle"
            ##Publisher" : "None
            ##ConferenceLocation
        except NoSuchElementException:
            print("caught NoSuchElementException at set_options.")
            self.save_current_page(
                driver, "./samples/NoSuchElementException_in_set_options.png")
            self.save_current_page(
                driver,
                "./samples/NoSuchElementException_in_set_options.html")
        #self.save_current_page(driver, "./samples/after_set_options.png")
        #self.save_current_page(driver, "./samples/after_set_options.html")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def get_urls_of_papers_in_keywords_page(self, driver, PerPage,
                                            num_of_papers="all", timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        if num_of_papers == "all":
            element = driver.find_element_by_css_selector(
                'div[class="pure-u-1-1 Dashboard-header ng-scope"] > span')
            num_of_papers = int(element.text.split(" ")[-1].replace(",", ""))
            self.log.debug("num_of_papers[" + str(num_of_papers) + "]")
        urls = []
        next_button = driver.find_element_by_xpath(
            '//a[@href="" and @ng-click="selectPage(page.number)" and '
            '@class="ng-binding"]')
        visited_buttons = [next_button.text]
        while True:
            self.log.debug("get paper urls in current page")
            for i in range(PerPage):
                paper_elements = driver.find_elements_by_xpath(
                    '//div[@class="js-displayer-content u-mt-1 '
                    'stats-SearchResults_DocResult_ViewMore ng-scope hide"]')
                self.log.debug("i[" + str(i) + "] len(paper_elements)[" +
                               str(len(paper_elements)) + "]")
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);")
                if len(paper_elements) == PerPage:
                    break
            self.log.debug("len(paper_elements)[" +
                           str(len(paper_elements)) + "]")
            for paper_element in paper_elements:
                url = paper_element.find_element_by_css_selector(
                    'a').get_attribute("href")
                self.log.debug("url[" + url + "]")
                urls.append(url)
                if len(urls) > num_of_papers:
                    self.log.debug("len(urls)[" + str(len(urls)) +
                                   "] > num_of_papers[" +
                                   str(num_of_papers) + "]. return urls.")
                    return urls
            self.log.debug("search buttons to next page")
            buttons = driver.find_elements_by_xpath(
                '//a[@href="" and @ng-click="selectPage(page.number)" and '
                '@class="ng-binding"]')
            i = 0
            for button in buttons:
                self.log.debug("i[" + str(i) + "], button.text[" +
                               button.text + "], visited_buttons:" +
                               str(visited_buttons))
                if button.text not in visited_buttons:
                    next_button = button
                    self.log.debug("break")
                    break
                i += 1
            if i == len(buttons):
                self.log.debug(
                    "i = len(buttons). already visited all buttons. break")
                break
            visited_buttons.append(next_button.text)
            self.log.debug("move to next page[" + next_button.text + "]")
            next_button.click()
            self.wait_search_results(driver, timeout)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return urls[" + str(len(urls)) + "]")
        return urls

    def get_attributes_and_download_pdf(self, search, driver,
                                        path="../../data/tmp/",
                                        filename="title"):
        print(__class__.__name__ + "." + sys._getframe().f_code.co_name +
              " start")
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/db")
        search.times += 1
        timeout = 30
        target_paper_url = search.node
        m = ("url[" + target_paper_url + "], times[" + str(search.times) +
             "], limit[" + str(search.limit) + "]")
        print(m)
        self.log.info(m)
        ##reconnect because of http.client.RemoteDisconnected
        #if search.times % 5 == 0:
        #    driver = self.reconnect_driver(driver, driver.current_url)
        #self.save_current_page(driver, "./samples/tmp.png")
        ##if this paper is already downloaded, it is treated as visited and skipped.
        #if target_paper_url in search.visited:
        self.move_to_paper_initial_page(driver, target_paper_url)
        import table_papers
        paper = table_papers.Table_papers()
        self.log.debug("get attributes of this paper")
        paper.title = self.get_title(driver)
        paper.authors = self.get_authors(driver)
        paper.keywords = self.get_keywords(driver)
        paper.citings, citing_papers, citing_urls = self.get_citing_papers(
            driver, timeout)
        paper.citeds, cited_papers, cited_urls = self.get_cited_papers(
            driver, timeout)
        paper.conference = self.get_conference(driver)
        paper.published = self.get_date_of_publication(driver)
        paper.url = target_paper_url
        paper.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        if filename == "title":
            filename = paper.title + ".pdf"
        paper.path = self.download_a_paper(driver, path=path,
                                           filename=filename,
                                           timeout=timeout)
        self.log.debug("download finished. wait start.")
        time.sleep(self.conf.getconf("IEEE_wait_time_per_download_paper"))
        self.log.debug("wait finished.")
        paper.id = paper.get_id()
        self.log.debug(paper.get_vars())
        paper.renewal_insert()
        self.log.debug("insert citations of this paper to db")
        import table_citations
        for citing_paper in citing_papers:
            citation = table_citations.Table_citations(start=paper.id,
                                                       end=citing_paper.id)
            citation.renewal_insert()
            citation.close()
        for cited_paper in cited_papers:
            citation = table_citations.Table_citations(start=cited_paper,
                                                       end=paper.id)
            citation.renewal_insert()
            citation.close()
        self.log.debug("check termination of searching loop")
        if 0 < search.limit and search.times >= search.limit:
            self.log.debug("search finished.")
            search.que = [search.node]
            import signal
            driver.service.process.send_signal(
                signal.SIGTERM)  # kill the specific phantomjs child proc
            driver.quit()  # quit the node proc
            return paper, [], []
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        self.log.debug("return paper[" + paper.title + "] citing_urls[" +
                       str(citing_urls) + "] cited_urls[" +
                       str(cited_urls) + "]")
        return paper, citing_urls, cited_urls

    def get_title(self, driver):
        return driver.title

    def get_authors(self, driver):
        authors_str = ""
        elements = driver.find_elements_by_xpath(
            '//span[@ng-bind-html="::author.name"]')
        for el in elements:
            authors_str += "," + el.text
        return authors_str[1:]

    def get_keywords(self, driver):
        ##keywords
        keywords_str = ""
        elements = driver.find_elements_by_xpath(
            '//a[@ng-bind-html="::term"]')
        #print(str(len(elements))) #21
        for el in elements:
            keyword = el.text
            if keyword in keywords_str:  ##todo internet concludes int
                self.log.debug("keyword[" + keyword +
                               "] is duplicated. not added.")
            else:
                keywords_str += "," + el.text
        return keywords_str

    def get_citing_papers(self, driver, timeout=30):
        ##citing_papers
        ##citing_urls
        import table_papers
        citings_str = ""
        citing_papers = []
        citing_urls = []
        try:
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="article in vm.contextData.similar"]')
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at get_citing_papers.")
        self.log.debug(str(len(elements)))
        #self.save_current_page(driver, "./samples/sample_page_4116687_start.html")
        #self.save_current_page(driver, "./samples/sample_page_4116687_start.png")
        self.log.debug("create arrays of paper and url")
        for el in elements:
            citing_paper = table_papers.Table_papers()
            citing_paper.url = (self.conf.getconf("IEEE_website") +
                                el.find_element_by_css_selector(
                                    'a').get_attribute("ng-href"))
            citing_paper.title = el.find_element_by_css_selector(
                'a').get_attribute("title")
            citing_paper.authors = el.find_element_by_css_selector(
                'div[class="ng-binding"]').text.replace(";", ",")
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            self.log.debug("citing_url[" + citing_paper.url + "]")
            self.log.debug("citing_title[" + citing_paper.title + "]")
            self.log.debug("citing_authors[" + citing_paper.authors + "]")
            self.log.debug(citing_paper.get_vars())
            citing_paper.renewal_insert()
            citing_papers.append(citing_paper)
            citing_urls.append(citing_paper.url)
        return citings_str, citing_papers, citing_urls

    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import table_papers
        citeds_str = ""
        cited_papers = []
        cited_urls = []
        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        #self.save_current_page(driver, "./samples/sample_page_1055638_start.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_start.png")
        try:
            div = driver.find_element_by_css_selector(
                'div > section[class="document-all-references ng-scope"] > '
                'div[class="ng-scope"] > div[class="strong"]').text
            if div == "Citations not available for this document.":
                self.log.debug("this paper not cited. return []")
                return citeds_str, cited_papers, cited_urls
            self.log.debug("div=" + div + ", this paper is cited")
        except NoSuchElementException:
            self.log.debug("this paper is cited")
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_element_by_xpath('//b[@class=\"ng-binding\"]')) start")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//b[@class="ng-binding"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the first cited page.")
            self.move_to_paper_initial_page(driver, initial_url)
            return citeds_str, cited_papers, cited_urls
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the first cited page.")
            self.move_to_paper_initial_page(driver, initial_url)
            return citeds_str, cited_papers, cited_urls
        self.log.debug("Wait Finished.")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.png")
        self.log.debug("continue pushing more view button")
        elements = self.continuous_pushing_more_view_button(driver, timeout)
        self.log.debug("create arrays of paper and url")
        for el in elements:
            cited_url = (self.conf.getconf("IEEE_website") +
                         el.find_element_by_css_selector(
                             'div[class="ref-links-container '
                             'stats-citations-links-container"] > span > a'
                         ).get_attribute("ng-href"))
            cited_urls.append(cited_url)
            cited_authors, cited_title, cited_conference, cited_date = \
                self.parse_citing(
                    el.find_element_by_css_selector(
                        'div[ng-bind-html="::item.displayText"]').text)
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            cited_paper = table_papers.Table_papers(
                title=cited_title, authors=cited_authors,
                conference=cited_conference, published=cited_date,
                url=cited_url, timestamp=timestamp)
            self.log.debug(cited_paper.get_vars())
            cited_paper.renewal_insert()
        #self.save_current_page(driver, "./samples/sample_page_cited_view_more.html")
        #self.save_current_page(driver, "./samples/sample_page_cited_view_more.png")
        self.move_to_paper_initial_page(driver, initial_url)
        #self.save_current_page(driver, "./samples/sample_page_initial.html")
        #self.save_current_page(driver, "./samples/sample_page_initial.png")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return citeds_str, cited_papers, cited_urls

    def continuous_pushing_more_view_button(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        ##if not cited, the load-more-button does not exist.
        ##but if cited, the load-more-button always exists even when there
        ##are no more papers; the extra buttons are hidden.
        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > '
            'div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]')
        num_of_viewing = len(elements)
        limit_of_view = self.conf.getconf("IEEE_citation_num_at_first_page")
        self.log.debug("num_of_viewing[" + str(num_of_viewing) +
                       "], limit_of_view[" + str(limit_of_view) + "]")
        while num_of_viewing > limit_of_view - 10:
            limit_of_view += self.conf.getconf(
                "IEEE_citation_num_per_more_view")
            try:
                load_more_button = driver.find_element_by_xpath(
                    '//button[@class="load-more-button" and '
                    '@ng-click="vm.loadMoreCitations(\'ieee\')"]')
                load_more_button.click()
                WebDriverWait(driver, timeout).until(
                    lambda driver: driver.find_element_by_xpath(
                        '//button[@class="load-more-button" and '
                        '@ng-click="vm.loadMoreCitations(\'ieee\')" and '
                        '@aria-disabled="false"]'))
            except TimeoutException:
                m = ("caught TimeoutException at loading more cited pages(" +
                     str(limit_of_view) + ") paper[" + driver.current_url +
                     "].")
                print(m)
                self.log.warning(m)
            except NoSuchElementException:
                m = ("caught NoSuchElementException at loading more cited "
                     "pages(" + str(limit_of_view) + ") paper[" +
                     driver.current_url + "].")
                print(m)
                self.log.warning(m)
            except ElementNotVisibleException:
                m = ("caught ElementNotVisibleException at loading more "
                     "cited pages(" + str(limit_of_view) + ") paper[" +
                     driver.current_url + "]. break.")
                self.log.debug(m)
                break
            # re-query with the same selector as above (the original used a
            # truncated selector here, which matched nothing)
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > '
                'div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]')
            num_of_viewing = len(elements)
            self.log.debug("num_of_viewing[" + str(num_of_viewing) +
                           "], limit_of_view[" + str(limit_of_view) + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return elements

    def get_conference(self, driver):
        try:
            return driver.find_element_by_xpath(
                '//div[@class="u-pb-1 stats-document-abstract-doi ng-scope"]'
            ).find_element_by_tag_name('a').text
        except NoSuchElementException:
            return ""

    def get_date_of_publication(self, driver):
        # "Date of Publication: 06 January 2016" or
        # "Date of Conference: 14-16 Nov. 2006"
        try:
            date = driver.find_element_by_xpath(
                '//div[@ng-if="::vm.details.isJournal == true"]').text
            return self.convert_date_of_publication_to_datetime(date)
        except NoSuchElementException:
            try:
                date = driver.find_element_by_xpath(
                    '//div[@ng-if="::vm.details.isConference == true"]').text
                return self.convert_date_of_publication_to_datetime(date)
            except NoSuchElementException:
                self.log.debug("caught NoSuchElementException. date = None"
                               )  ##todo get from paper??
                driver.save_current_page(
                    "./samples/caughtNoSuchElementExceptionatdate_of_publication.png")
                driver.save_current_page(
                    "./samples/caughtNoSuchElementExceptionatdate_of_publication.html")
                return None

    def move_to_paper_initial_page(self, driver, initial_url, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        driver.get(initial_url)
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_element_by_xpath('//div[@ng-repeat=\"article in "
            "vm.contextData.similar\"]'))")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//div[@ng-repeat="article in vm.contextData.similar"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the paper top page.")
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the paper top page.")
        self.log.debug("Wait Finished.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def wait_button_to_pdf_page(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("Wait start.")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_css_selector(
                    'i[class="icon doc-act-icon-pdf"]'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at waiting for the button which "
                "goes to the pdf page.")
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at waiting for the button "
                "which goes to the pdf page.")
        self.log.debug("Wait Finished.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def download_a_paper(self, driver, path="../../data/tmp/",
                         filename="default", timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        initial_url = driver.current_url
        m = "downloading paper to " + path + ". title[" + filename + "]"
        self.log.info(m)
        print(m)
        self.wait_button_to_pdf_page(driver, timeout)
        button = driver.find_element_by_css_selector(
            'i[class="icon doc-act-icon-pdf"]')
        retries = 10
        while retries > 0:
            try:
                button.click()
                self.log.debug("clicked button and no exception. break")
                break
            except (RemoteDisconnected, ConnectionRefusedError,
                    URLError) as e:
                self.log.warning("caught " + e.__class__.__name__ +
                                 " at click download pdf button. retries[" +
                                 str(retries) + "]")
                self.log.warning(e, exc_info=True)
                time.sleep(
                    self.conf.getconf("IEEE_wait_time_per_download_paper"))
                driver.reconnect(initial_url)
                self.wait_button_to_pdf_page(driver, timeout)
                button = driver.find_element_by_css_selector(
                    'i[class="icon doc-act-icon-pdf"]')
                retries -= 1
            except NoSuchElementException:
                self.log.warning(
                    "caught NoSuchElementException at click download pdf "
                    "button. retries[" + str(retries) + "]")
                self.save_current_page(
                    driver,
                    "./samples/caught_NoSuchElementException_at_click_download_pdf_button.html")
                self.save_current_page(
                    driver,
                    "./samples/caught_NoSuchElementException_at_click_download_pdf_button.png")
                retries -= 1
        if retries == 0:
            self.log.error("button.click() error")
            self.save_current_page(driver,
                                   "./samples/button_click_error.html")
            self.save_current_page(driver,
                                   "./samples/button_click_error.png")
        self.log.debug("Wait start.")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//frameset[@rows="65,35%"]/frame'))
        except TimeoutException:
            self.log.warning(
                "caught TimeoutException at load the IEEE pdf page.")
            self.log.warning("skip to download pdf. return \"\"")
            driver.get(initial_url)
            return ""
        except NoSuchElementException:
            self.log.warning(
                "caught NoSuchElementException at load the IEEE pdf page.")
            self.log.warning("skip to download pdf. return \"\"")
            driver.get(initial_url)
            return ""
        self.log.debug("Wait Finished.")
        url = driver.find_elements_by_xpath(
            '//frameset[@rows="65,35%"]/frame')[1].get_attribute("src")
        self.log.debug("url:" + url)
        if filename == "default":
            filename = url[:url.index("?")].split("/")[-1]
        filename = filename.replace(":", "")
        self.log.debug("filename:" + filename)
        command = ("wget -p \"" + url + "\" -O \"" + path + filename +
                   "\" > /dev/null 2>&1")
        #command = "wget -p \"" + url + "\" -O \"" + path + filename + "\" 1> /dev/null 2>&1"
        #command = "wget -p \"" + url + "\" -O \"" + path + filename + "\""
        self.log.debug(command)
        try:
            self.log.debug(os.system(command))
        except Exception:
            self.log.warning("error at " + command)
        #self.save_current_page(driver, "./samples/7898372.png")
        #self.save_current_page(driver, "./samples/7898372.html")
        driver.get(initial_url)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        self.log.debug("return[" + path + filename + "]")
        return path + filename

    def convert_date_of_publication_to_datetime(self, string):
        ##from "Date of Publication: 06 January 2016"    to 2016-01-06
        ##from "Date of Conference: 14-16 Nov. 2006"     to 2006-11-14
        ##from "Date of Conference: 27 June-2 July 2016" to 2016-06-27
        ##from "Date of Publication: N/A 2016"           to 2016-01-01
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("string: " + string)
        date = ""
        month = ""
        year = ""
        string = string.replace("\n", "")
        tmp = string.split(":")
        if len(tmp) != 2:
            self.log.warning("len(tmp) != 2")
            self.log.warning("string:" + string)
            return None
        date_month_year = tmp[1].lstrip()
        self.log.debug("date_month_year[" + date_month_year + "]")
        tmp2 = date_month_year.split("-")
        if len(tmp2) >= 3:
            self.log.warning("len(tmp2) >= 3")
            self.log.warning("string:" + string)
            return None
        elif len(tmp2) == 2:
            if re.match(r"^\d{1,2}$", tmp2[0]):
                date = tmp2[0]
            elif re.match(r"^\d{1,2}\s[a-zA-Z]", tmp2[0]):
                tmp3 = tmp2[0].split(" ")
                date = tmp3[0]
                month = tmp3[1].replace(".", "")
        tmp4 = date_month_year.split(" ")
        if len(tmp4) < 3:
            self.log.debug("only year")
            self.log.debug("string:" + string)
            date = "1"
            month = "Jan"
        if date == "":
            date = tmp4[-3]
        if month == "":
            month = tmp4[-2].replace(".", "")
        if year == "":
            year = tmp4[-1]
        import datetime
        try:
            month = str(datetime.datetime.strptime(month, '%B').month)
        except ValueError:
            try:
                month = str(datetime.datetime.strptime(month, '%b').month)
            except ValueError:
                if month == "Sept":
                    month = "9"
                else:
                    self.log.warning("ValueError")
                    self.log.warning("string:" + string)
                    self.log.warning("month = 0")
                    month = "0"
        self.log.debug("year[" + year + "], month[" + month + "], date[" +
                       date + "]")
        timestamp = datetime.date(int(year), int(month), int(date))
        return timestamp

    def convert_paper_url_to_cited_url(self, url):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        #from http://ieeexplore.ieee.org/document/4116687/?reload=true
        #to   http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        self.log.debug("url[" + url + "]")
        converted_url = (url.split("?")[0] +
                         "citations?anchor=anchor-paper-citations-ieee&ctx=citations")
        self.log.debug("converted_url[" + converted_url + "]")
        return converted_url

    def convert_paper_url_to_pdf_url(self, url):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        ##from http://ieeexplore.ieee.org/document/6324382/
        ##to   http://ieeexplore.ieee.org/ielx7/35/7901458/07901477.pdf?tp=&arnumber=7901477&isnumber=7901458
        print("url[" + url + "]")

    def parse_citing(self, strings):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("src_str[" + strings + "]")
        #from
        #Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the Large-Scale
        #Web Application Community", Advanced Information Networking and
        #Applications Workshops (WAINA) 2013 27th International Conference
        #on, pp. 185-190, 2013.
        #to
        #Daniel Garant, Wei Lu,
        #Mining Botnet Behaviors on the Large-Scale Web Application Community
        #Advanced Information Networking and Applications Workshops (WAINA)
        #2013 27th International Conference on
        #pp. 185-190, 2013
        array = strings.split("\"")
        if len(array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(array)(" + str(len(array)) +
                             ") < 3. return \"\", \"\", \"\", \"\"")
            return "", "", "", ""
        authors = array[0]
        title = array[1]
        new_array = array[2][1:].split(",")
        self.log.debug("new_array:" + str(new_array))
        self.log.debug(len(new_array))
        if len(new_array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") < 3. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        elif len(new_array) == 3:
            conference, page, year = new_array
        elif len(new_array) == 4:
            conference, vol, page, year = new_array
        elif len(new_array) == 5:
            conference, vol, page, year, issn = new_array
        else:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("strings[" + strings + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") > 5. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        #self.log.debug("re.match(\"\d*\", " + year + ")")
        #year = re.match("*\d*", year).group() + "-01-01 00:00:00"
        #year += "-01-01 00:00:00"
        self.log.debug("citing year is none")
        year = None
        self.log.debug("authors[" + str(authors) + "], title[" +
                       str(title) + "], conference[" + str(conference) +
                       "], year[" + str(year) + "]")
        return authors, title, conference, year

    def reconnect_driver(self, driver, url):
        self.log.debug("driver reconnect")
        import signal
        driver.service.process.send_signal(
            signal.SIGTERM)  # kill the specific phantomjs child proc
        driver.quit()  # quit the node proc
        driver = self.create_driver(url)
        return driver

    ## for debug
    def print_h2_attributes(self, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        links = driver.find_elements_by_tag_name("h2")
        for link in links:
            print(link.text)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def save_current_page(self, driver, filename):
        self.log.warning(__class__.__name__ + "." +
                         sys._getframe().f_code.co_name + " start")
        self.log.warning("this method will be removed.")
        self.log.warning("please use driver.save_current_page(filename)")
        path, suffix = os.path.splitext(filename)
        self.log.debug("path[" + path + "], suffix[" + suffix + "]")
        if suffix == ".html":
            with open(filename, 'w') as f:
                f.write(driver.page_source)
        elif suffix == ".png":
            driver.save_screenshot(filename)
        else:
            self.log.error("TYPEERROR suffix[" + suffix + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def show_options(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.opt.show_options()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
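# A self-contained sketch of what convert_date_of_publication_to_datetime
# above produces for the simplest documented input form (stdlib only; the
# full method also copes with date ranges, abbreviated months, and missing
# fields):
import datetime

def parse_publication_date(text):
    # handles only the "Date of Publication: 06 January 2016" form
    day, month_name, year = text.split(":")[1].split()
    month = datetime.datetime.strptime(month_name, "%B").month
    return datetime.date(int(year), month, int(day))

assert parse_publication_date(
    "Date of Publication: 06 January 2016") == datetime.date(2016, 1, 6)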
class Table_papers(Base):
    __tablename__ = 'papers'
    id = Column("id", INTEGER, primary_key=True)
    title = Column("title", TEXT)
    authors = Column("authors", TEXT)
    keywords = Column("keywords", TEXT)
    citings = Column("citings", MEDIUMTEXT)
    citeds = Column("citeds", MEDIUMTEXT)
    conference = Column("conference", TINYTEXT)
    published = Column("published", DATE)
    url = Column("url", TINYTEXT)
    abstract_path = Column("abstract_path", TEXT)
    pdf_path = Column("pdf_path", TEXT)
    timestamp = Column("timestamp", DATETIME)
    label = Column("label", TINYTEXT)
    color = Column("color", TINYTEXT)

    def __init__(self, id="", title="", authors="", keywords="", citings="",
                 citeds="", conference="", published="", url="",
                 timestamp="", abstract_path="", pdf_path="", label="",
                 color=""):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import mysql_operator
        self.db = mysql_operator.Mysql_operator()
        self.id = id
        self.title = title
        self.authors = authors
        self.keywords = keywords
        self.citings = citings
        self.citeds = citeds
        self.conference = conference
        if published == "":
            self.published = None
        else:
            self.published = published
        self.url = url
        if timestamp == "":
            self.timestamp = None
        else:
            self.timestamp = timestamp
        self.abstract_path = abstract_path
        self.pdf_path = pdf_path
        self.label = label
        self.color = color

    def __repr__(self):
        return 'Table_papers'

    def insert(self):
        if self.id == "":
            self.id = self.get_id()
        vars_to_encode = [
            "title", "authors", "keywords", "abstract_path", "pdf_path"
        ]
        for var in vars_to_encode:
            if eval("self." + var) is not None:
                exec("self." + var + " = self." + var +
                     ".encode('utf-8', 'replace')")
        self.db.insert(self)
        for var in vars_to_encode:
            if eval("self." + var) is not None:
                exec("self." + var + " = self." + var +
                     ".decode('utf-8', 'replace')")
        self.db.session.expunge(self)
        self.close()

    def has_already_downloaded(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("paper.title[" + self.title + "]")
        if self.conf.getconf("IEEE_paper_download_period") <= 0:
            self.log.debug("IEEE_paper_download_period <= 0, return False")
            return False
        records = self.db.session.query(__class__).filter(
            __class__.title == self.title.encode('utf-8', 'replace')).all()
        if len(records) == 0:
            self.log.debug("This paper doesn't exist in db. return False")
            return False
        elif len(records) >= 2:
            self.log.warning("need to merge records")
            self.log.warning("title[" + self.title + "], len(records)[" +
                             str(len(records)) + "]")
        self.log.debug("This paper exists in db. Number of records is [" +
                       str(len(records)) + "]")
        if records[0].abstract_path == "":
            self.log.debug("but the abstract not downloaded. return False")
            return False
        self.log.debug(
            "and the abstract already downloaded. compare timestamps")
        limit = datetime.datetime.now() - timedelta(
            days=self.conf.getconf("IEEE_paper_download_period"))
        self.log.debug("limit[" + str(limit) + "], records[" +
                       str(records[0].timestamp) + "]")
        if limit > records[0].timestamp:
            self.log.debug("should renew db. return False")
            return False
        else:
            self.log.debug(
                "recently downloaded. clone paper and return True")
            clone_vars = [
                "authors", "keywords", "citings", "citeds", "conference",
                "published", "url", "timestamp", "abstract_path",
                "pdf_path", "label", "color"
            ]
            for var in clone_vars:
                exec("self." + var + " = records[0]." + var)
            self.close()
            return True
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def renewal_insert(self):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        # check duplication and insert
        records = self.db.session.query(__class__).filter(
            __class__.title == self.title.encode('utf-8')).all()
        if len(records) == 0:
            # new record
            self.insert()
            return 0
        merge_id_list = []
        for record in records:
            merge_id_list.append(record.id)
        vars = [
            "authors", "keywords", "citings", "citeds", "conference",
            "published", "url", "abstract_path", "pdf_path", "label",
            "color"
        ]
        for var in vars:
            for record in records:
                self.log.debug("record.id[" + str(record.id) + "]")
                self.log.debug("var[" + var + "], self[" +
                               str(eval("self." + var)) + "], record[" +
                               str(eval("record." + var)) + "]")
                tmp_timestamp = self.timestamp
                if (eval("record." + var) is None or
                        eval("record." + var) == ""):
                    self.log.debug("record." + var + " == None")
                elif (eval("self." + var) is None or
                        eval("self." + var) == ""):
                    self.log.debug("self." + var + " == None")
                    #tmp = eval("self." + var)
                    #tmp = eval("record." + var)
                    exec("self." + var + " = record." + var)
                    self.log.debug("->var[" + var + "], self[" +
                                   str(eval("self." + var)) + "], record[" +
                                   str(eval("record." + var)) + "]")
                    tmp_timestamp = record.timestamp
                else:
                    self.log.debug(var + " is not none. compare timestamps")
                    ## todo: check type(timestamp)
                    if (tmp_timestamp is None or
                            record.timestamp is None or
                            self.compare_timestamps(
                                old=tmp_timestamp, new=record.timestamp)):
                        ##if record.timestamp is newer
                        exec("self." + var + " = record." + var)
                        self.log.debug("->var[" + var + "], self[" +
                                       str(eval("self." + var)) +
                                       "], record[" +
                                       str(eval("record." + var)) + "]")
                        tmp_timestamp = record.timestamp
                #except:
                #    m = ("caught exception at tmp_timestamp[" +
                #         str(tmp_timestamp) + "] < record.timestamp[" +
                #         str(record.timestamp) + "]")
                #    self.log.warning(m)
                #    print(m)
        for record in records:
            self.db.delete(record)
        self.id = self.get_id()
        import time
        self.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        self.insert()
        ##merge citations
        self.log.debug("merge[" + str(merge_id_list) + "] to self.id[" +
                       str(self.id) + "]")
        for merge_id in merge_id_list:
            from table_citations import Table_citations
            from sqlalchemy import and_, or_
            merge_records = self.db.session.query(Table_citations).filter(
                or_(Table_citations.start == merge_id,
                    Table_citations.end == merge_id)).all()
            self.log.debug("id[" + str(merge_id) + "].records[" +
                           str(len(merge_records)) + "]")
            for merge_record in merge_records:
                self.merge_citations(merge_record, merge_id_list,
                                     survival_id=self.id,
                                     delete_id=merge_id)
        self.close()

    def merge_citations(self, merge_record, merge_id_list, survival_id,
                        delete_id):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("from[" + str(merge_record.start) + "]to[" +
                       str(merge_record.end) + "]")
        self.log.debug("survival_id[" + str(survival_id) + "], delete_id[" +
                       str(delete_id) + "]")
        from table_citations import Table_citations
        ##is delete_id start or end?
        if (merge_record.start in merge_id_list and
                merge_record.end in merge_id_list):
            self.log.debug("start[" + str(merge_record.start) +
                           "] and end[" + str(merge_record.end) +
                           "] are merge_id. delete.")
            self.log.debug("delete(merge_record)")
            self.db.delete(merge_record)
        #elif merge_record.start == delete_id and not merge_record.end in merge_id_list:
        elif merge_record.start == delete_id:
            self.log.debug("start[" + str(delete_id) +
                           "] is delete_id. end[" + str(merge_record.end) +
                           "]")
end[" + str(merge_record.end) + "]") self.log.debug("delete(merge_record)") self.db.delete(merge_record) citation = Table_citations(start=survival_id, end=merge_record.end) citation.renewal_insert() citation.close() #elif merge_record.end == delete_id and not merge_record.end in merge_id_list: elif merge_record.end == delete_id: self.log.debug("end[" + str(merge_record.end) + "] is delete_id. start[" + str(merge_record.start) + "]") citation = Table_citations(start=merge_record.start, end=survival_id) self.log.debug("delete(merge_record)") self.db.delete(merge_record) citation.renewal_insert() citation.close() def compare_timestamps(self, old, new): self.log.debug("compare old_timestamp[" + str(old) + "] < new[" + str(new) + "]?") old_str = str(old) new_str = str(new) if old_str < new_str: self.log.debug("return true") return True else: self.log.debug("return false") return False def get_citings_array(self): return self.citings.split(",") def get_citeds_array(self): return self.citeds.split(",") def get_id(self): self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " start") ##when the records which have same title exist, ##the id is smallest one of records. records = self.db.session.query(__class__).filter( __class__.title == self.title.encode('utf-8')).all() if len(records) == 0: #new record return self._get_available_id() id = records[0].id for record in records: if id > record.id: id = record.id return id def _get_available_id(self): self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " start") previous_id = 0 for q in self.db.session.query(__class__).order_by(__class__.id): if q.id - previous_id >= 2: self.log.debug("id[" + str(q.id) + "] - previous_id[" + str(previous_id) + "] > 2. return " + str(previous_id + 1)) return previous_id + 1 previous_id = q.id self.log.debug("for loop ended. return " + str(previous_id + 1)) return previous_id + 1 def close(self): self.db.session.close() self.db.close() def get_vars(self): return ("{" + "id: " + str(self.id) + ", " + "title: " + self.title + ", " + "authors: " + self.authors + ", " + "keywords: " + self.keywords + ", " + "citings: " + self.citings + ", " + "citeds: " + self.citeds + ", " + "conference: " + self.conference + ", " + "published: " + str(self.published) + ", " + "url: " + self.url + ", " + "timestamp: " + str(self.timestamp) + ", " + "abstract_path: " + self.abstract_path + ", " + "pdf_path: " + self.pdf_path + ", " + "label: " + self.label + ", " + "color: " + self.color + ", " + "}")
class IEEEXplore:
    def __init__(self):
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
        from conf import Conf
        self.conf = Conf()
        from log import Log as l
        self.log = l.getLogger()
        self.opt = Search_options()
        self.log.debug("class " + __class__.__name__ + " created.")

    def get_papers_of_new_conferences(self, conference_num):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + "(conference_num=" +
                      str(conference_num) + ") start.")

    def get_papers_by_keywords(self, keywords, num_of_papers):
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " start")
        self.log.info("keywords[" + keywords + "], num_of_papers[" +
                      str(num_of_papers) + "]")
        driver = self.create_driver()
        self.search_by_keywords(driver, keywords)
        urls = self.get_urls_of_papers_in_keywords_page(driver,
                                                        num_of_papers)
        all_papers = []
        all_citing_urls = []
        all_cited_urls = []
        """
        for url in urls:
            driver.get(url)
            paper, citing_urls, cited_urls = \
                self.get_attributes_and_download_pdf(driver)
            all_papers.append(paper)
            all_citing_urls.append(citing_urls)
            all_cited_urls = (cited_urls)
        self.log.info(__class__.__name__ + "." +
                      sys._getframe().f_code.co_name + " finished")
        """
        return all_papers, all_cited_urls, all_citing_urls

    def get_papers_of_target_conference(self, conference_name):
        pass

    def create_driver(self, top_page=""):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        phantomjs_path = self.conf.getconf("phantomJS_pass")
        if top_page == "":
            top_page = self.conf.getconf("IEEE_top_page")
        from selenium import webdriver
        driver = webdriver.PhantomJS(phantomjs_path)
        self.log.debug("driver.get(" + top_page + ")")
        driver.get(top_page)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return driver")
        return driver

    def search_by_keywords(self, driver, keywords):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except Exception as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check logfile.")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def set_options(self):
        pass

    def get_urls_of_papers_in_keywords_page(self, driver, num_of_papers):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        urls = []
        links = driver.find_elements_by_class_name("pure-u-22-24")
        self.log.debug("len(links)[" + str(len(links)) + "]")
        for link in links:
            element = link.find_element_by_css_selector("h2 > a")
            urls.append(element.get_attribute("href"))
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name +
                       " finished. return " + str(urls))
        return urls

    def get_attributes_and_download_pdf(self, search, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        sys.path.append(
            os.path.dirname(os.path.abspath(__file__)) + "/../../lib/db")
        timeout = 30
        target_paper_url = search.node
        self.log.info("url[" + target_paper_url + "], times[" +
                      str(search.times) + "], limit[" + str(search.limit) +
                      "]")
        #if this paper is already downloaded, it is treated as visited and skipped.
        driver.get(target_paper_url)
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_element_by_xpath('//div[@ng-repeat=\"article in "
            "vm.contextData.similar\"]'))")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//div[@ng-repeat="article in vm.contextData.similar"]'))
        except TimeoutException:
            m = "caught TimeoutException at load the paper top page."
            print(m)
            self.log.warning(m)
        except NoSuchElementException:
            m = "caught NoSuchElementException at load the paper top page."
            print(m)
            self.log.warning(m)
        self.log.debug("Wait Finished.")
        import table_papers
        paper = table_papers.Table_papers()
        self.log.debug("get attributes of this paper")
        #paper.title = self.get_title(driver)
        #paper.authors = self.get_authors(driver)
        #paper.keywords = self.get_keywords(driver)
        #citing_urls = []
        paper.citings, citing_papers, citing_urls = self.get_citing_papers(
            driver, timeout)
        cited_urls = []
        #paper.citeds, cited_papers, cited_urls = self.get_cited_papers(driver, timeout)
        #paper.conference = self.get_conference(driver)
        #paper.published = self.get_date_of_publication(driver)
        #paper.url = target_paper_url
        import time
        paper.timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        ##path
        #paper.renewal_insert()
        print(paper.get_vars())
        self.log.debug("insert citations of this paper to db")
        import table_citations
        for citing_paper in citing_papers:
            citation = table_citations.Table_citations(start=paper.id,
                                                       end=citing_paper.id)
            citation.renewal_insert()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return paper, citing_urls, cited_urls

    def get_title(self, driver):
        #element = driver.find_element_by_tag_name("title")
        #element = driver.find_element_by_id("title")
        #element = driver.find_element_by_css_selector("html > title")
        #element = driver.find_element_by_class_name("title")
        return driver.title

    def get_authors(self, driver):
        ##authors
        #<span ng-bind-html="::author.name" class="ng-binding">
        #elements = driver.find_elements_by_class_name("authors-container")
        #print(str(len(elements)))
        authors_str = ""
        elements = driver.find_elements_by_xpath(
            '//span[@ng-bind-html="::author.name"]')
        #print(str(len(elements))) #5
        for el in elements:
            authors_str += "," + el.text
        return authors_str[1:]

    def get_keywords(self, driver):
        ##keywords
        keywords_str = ""
        elements = driver.find_elements_by_xpath(
            '//a[@ng-bind-html="::term"]')
        #print(str(len(elements))) #21
        for el in elements:
            keyword = el.text
            if keyword in keywords_str:  ##todo internet concludes int
                self.log.debug("keyword[" + keyword +
                               "] is duplicated. not added.")
            else:
                keywords_str += "," + el.text
        return keywords_str

    def get_citing_papers(self, driver, timeout=30):
        ##citing_papers
        ##citing_urls
        """
        <a ng-href="/document/4116687" title="Using Machine Learning
           Techniques to Identify Botnet Traffic" target="_self"
           href="/document/4116687">
          <span ng-bind-html="::(vm.contextData.isStandard ?
                article.standardNumber + ' - ' + article.title :
                article.title) | charlimitHtml:185" mathjax-bind=""
                class="ng-binding">Using Machine Learning Techniques to
                Identify Botnet Traffic</span>
        </a>
        <div class="ng-binding">Carl Livadas; Robert Walsh; David Lapsley;
        W. Timothy Strayer</div>
        </div><!-- end ngRepeat: article in vm.contextData.similar -->
        <div class="doc-all-related-articles-list-item ng-scope"
             ng-repeat="article in vm.contextData.similar">
        """
        import table_papers
        citings_str = ""
        citing_papers = []
        citing_urls = []
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: "
            "driver.find_elements_by_css_selector('div[ng-repeat=\"article "
            "in vm.contextData.similar\"] > a')) start")
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_elements_by_css_selector(
                    'div[ng-repeat="article in vm.contextData.similar"] > a'))
        except TimeoutException:
            m = "caught TimeoutException at load the paper top page."
            print(m)
            self.log.warning(m)
            return citings_str, citing_papers, citing_urls
        except NoSuchElementException:
            m = "caught NoSuchElementException at load the paper top page."
            print(m)
            self.log.warning(m)
            return citings_str, citing_papers, citing_urls
        self.log.debug("Wait Finished.")
        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="article in vm.contextData.similar"]')
        print(str(len(elements)))
        self.save_current_page(driver,
                               "./samples/sample_page_4116687_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_4116687_start.png")
        print("create arrays of paper and url")
        for el in elements:
            citing_paper = table_papers.Table_papers()
            citing_paper.url = (self.conf.getconf("IEEE_website") +
                                el.find_element_by_css_selector(
                                    'a').get_attribute("ng-href"))
            citing_paper.title = el.find_element_by_css_selector(
                'a').get_attribute("title")
            citing_paper.authors = el.find_element_by_css_selector(
                'div[class="ng-binding"]').text.replace(";", ",")
            import time
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            print("citing_url[" + citing_paper.url + "]")
            print("citing_title[" + citing_paper.title + "]")
            print("citing_authors[" + citing_paper.authors + "]")
            print(citing_paper.get_vars())
            citing_paper.renewal_insert()
            citing_papers.append(citing_paper)
            citing_urls.append(citing_paper.url)
        return citings_str, citing_papers, citing_urls

    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import table_papers
        citeds_str = ""
        cited_papers = []
        cited_urls = []
        #href="/document/4116687/citations?tabFilter=papers"
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.png")
        """
        <div ng-if="!vm.loading && !vm.details.paperCitations.ieee &&
             !vm.details.paperCitations.nonIeee &&
             !vm.details.patentCitations" class="ng-scope" style="">
          Citations are not available for this document.
        </div>
        """
        #el = driver.find_element_by_xpath('//div[@ng-if="!vm.loading && !vm.details.paperCitations.ieee && !vm.details.paperCitations.nonIeee && !vm.details.patentCitations"')
        #els = driver.find_elements_by_xpath('//div[@class="ng-scope" and @style=""]') #ok. got els
    def get_cited_papers(self, driver, timeout=30):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        import table_papers
        citeds_str = ""
        cited_papers = []
        cited_urls = []
        #href="/document/4116687/citations?tabFilter=papers"
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        initial_url = driver.current_url
        driver.get(self.convert_paper_url_to_cited_url(initial_url))
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.html")
        self.save_current_page(driver,
                               "./samples/sample_page_1055638_start.png")
        """
        <div ng-if="!vm.loading && !vm.details.paperCitations.ieee && !vm.details.paperCitations.nonIeee && !vm.details.patentCitations" class="ng-scope" style="">
            Citations are not available for this document.
        </div>
        """
        #el = driver.find_element_by_xpath('//div[@ng-if="!vm.loading && !vm.details.paperCitations.ieee && !vm.details.paperCitations.nonIeee && !vm.details.patentCitations"')
        #els = driver.find_elements_by_xpath('//div[@class="ng-scope" and @style=""]')  #ok, got els
        #els = driver.find_elements_by_xpath('//div[@ng-if="::!vm.contextData.paperCitations.ieee && !vm.contextData.paperCitations.nonIeee && !vm.contextData.patentCitations"]')  #0
        #><div ng-if="::!vm.contextData.paperCitations.ieee && !vm.contextData.paperCitations.nonIeee && !vm.contextData.patentCitations" class="ng-scope">
        #els = driver.find_elements_by_xpath('//div[@ng-if="::!vm.contextData.paperCitations.ieee"]')  #0
        try:
            div = driver.find_element_by_css_selector(
                'div > section[class="document-all-references ng-scope"] > div[class="ng-scope"] > div[class="strong"]'
            ).text
            if div == "Citations not available for this document.":
                self.log.debug("this paper is not cited. return []")
                return citeds_str, cited_papers, cited_urls
            self.log.debug("div=" + div + ", this paper is cited")
        except NoSuchElementException:
            self.log.debug("this paper is cited")
        """
        try:
            driver.find_element_by_name('queryText').send_keys(keywords)
            driver.find_element_by_class_name('Search-submit').click()
        except(Exception) as e:
            self.log.exception('[[EXCEPTION OCCURRED]]: %s', e)
            sys.exit("[[EXCEPTION OCCURRED]] please check the logfile.")

        document-banner-metric ng-scope
        ui-sref="document.full({tab:'citations', q:null, ctx:null, section:null, part:null, anchor:null, tabFilter: 'papers'})"

        #self.save_current_page(driver, "./samples/sample_page2.html")
        self.save_current_page(driver, "./samples/sample_page2.png")

        <button class="load-more-button" type="button" ng-click="vm.loadMoreCitations('patent')" ng-disabled="vm.loading" tabindex="0" aria-disabled="false">
            <span ng-show="!vm.loading" aria-hidden="false" class="">View More</span>
            <i class="fa fa-spinner fa-spin ng-hide" ng-show="vm.loading" aria-hidden="true"></i>
        """
        self.log.debug(
            "WebDriverWait(driver, timeout).until(lambda driver: driver.find_element_by_xpath('//b[@class=ng-binding]' start"
        )
        try:
            WebDriverWait(driver, timeout).until(
                lambda driver: driver.find_element_by_xpath(
                    '//b[@class="ng-binding"]'))
        except TimeoutException:
            m = "caught TimeoutException while loading the first cited page."
            print(m)
            self.log.warning(m)
            driver.get(initial_url)
            return citeds_str, cited_papers, cited_urls
        except NoSuchElementException:
            m = "caught NoSuchElementException while loading the first cited page."
            print(m)
            self.log.warning(m)
            driver.get(initial_url)
            return citeds_str, cited_papers, cited_urls
        self.log.debug("Wait finished.")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited.png")
        elements = driver.find_elements_by_css_selector(
            'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]'
        )
        num_of_viewing = len(elements)
        limit_of_view = self.conf.getconf("IEEE_citation_num_at_first_page")
        print("num_of_viewing[" + str(num_of_viewing) + "], limit_of_view[" +
              str(limit_of_view) + "]")
        print("continue pushing the View More button")
        ##if the paper is not cited, the load-more-button does not exist;
        ##if it is cited, the load-more-button always exists even when there
        ##are no more papers, and the extra buttons are hidden.
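        ## The loop below keeps clicking "View More" while the page appears to
        ## have loaded a full batch: once fewer than (limit_of_view - 10) items
        ## are shown, it assumes no more citations remain. The slack of 10 and
        ## the batch sizes come from conf (IEEE_citation_num_at_first_page /
        ## IEEE_citation_num_per_more_view); this reading of the condition is
        ## an interpretation of the original logic, not documented behavior.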
        while num_of_viewing > limit_of_view - 10:
            limit_of_view += self.conf.getconf("IEEE_citation_num_per_more_view")
            load_more_button = driver.find_element_by_xpath(
                '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')"]'
            )
            load_more_button.click()
            try:
                WebDriverWait(driver, timeout).until(
                    lambda driver: driver.find_element_by_xpath(
                        '//button[@class="load-more-button" and @ng-click="vm.loadMoreCitations(\'ieee\')" and @aria-disabled="false"]'
                    ))
            except TimeoutException:
                m = ("caught TimeoutException while loading more cited pages(" +
                     str(limit_of_view) + ") paper[" + driver.current_url + "].")
                print(m)
                self.log.warning(m)
            except NoSuchElementException:
                m = ("caught NoSuchElementException while loading more cited pages(" +
                     str(limit_of_view) + ") paper[" + driver.current_url + "].")
                print(m)
                self.log.warning(m)
            #elements = driver.find_elements_by_css_selector('div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div > div > b[class="ng-binding"]')
            elements = driver.find_elements_by_css_selector(
                'div[ng-repeat="item in vm.contextData.paperCitations.ieee"] > div[class="pure-g pushTop10"] > div[class="pure-u-23-24"]'
            )
            #self.save_current_page(driver, "./samples/sample_page_1055638_cited_" + str(limit_of_view) + ".html")
            #self.save_current_page(driver, "./samples/sample_page_1055638_cited_" + str(limit_of_view) + ".png")
            num_of_viewing = len(elements)
            print("num_of_viewing[" + str(num_of_viewing) +
                  "], limit_of_view[" + str(limit_of_view) + "]")
        print("cited loop finished. num_of_viewing[" + str(num_of_viewing) +
              "], limit_of_view[" + str(limit_of_view) + "]")
        print("create arrays of paper and url")
        for el in elements:
            cited_url = self.conf.getconf(
                "IEEE_website") + el.find_element_by_css_selector(
                    'div[class="ref-links-container stats-citations-links-container"] > span > a'
                ).get_attribute("ng-href")
            cited_urls.append(cited_url)
            cited_authors, cited_title, cited_conference, cited_date = self.parse_citing(
                el.find_element_by_css_selector(
                    'div[ng-bind-html="::item.displayText"]').text)
            import time
            timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
            cited_paper = table_papers.Table_papers(title=cited_title,
                                                    authors=cited_authors,
                                                    conference=cited_conference,
                                                    published=cited_date,
                                                    url=cited_url,
                                                    timestamp=timestamp)
            print(cited_paper.get_vars())
            cited_paper.renewal_insert()
            # was missing: cited_papers is returned below but never filled
            cited_papers.append(cited_paper)
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited_view_more.html")
        #self.save_current_page(driver, "./samples/sample_page_1055638_cited_view_more.png")
        driver.get(initial_url)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
        return citeds_str, cited_papers, cited_urls

    def get_conference(self, driver):
        conference = driver.find_element_by_xpath(
            '//div[@class="u-pb-1 stats-document-abstract-doi ng-scope"]'
        ).find_element_by_tag_name('a').text
        return conference

    def get_date_of_publication(self, driver):
        #Date of Publication: 06 January 200 or Date of Conference 14-16 Nov. 2006
        try:
            date = driver.find_element_by_xpath(
                '//div[@ng-if="::vm.details.isConference == true"]').text
        except NoSuchElementException:
            self.log.debug("caught NoSuchElementException. date = ''")  ##todo paper
            date = ""
        return self.convert_to_datetime(date)
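    ## convert_to_datetime (further below) is flagged as incomplete. This is a
    ## minimal parsing sketch for the two formats mentioned in the comment in
    ## get_date_of_publication ("06 January ..." publication dates and
    ## "14-16 Nov. 2006" conference ranges); for a range it keeps only the end
    ## date. The formats are assumptions based on that comment, not on live
    ## page data, and this helper is not wired into the scraper.
    def parse_publication_date(self, text):
        import datetime
        text = text.split(":")[-1].strip()  # drop a "Date of ...:" prefix if present
        parts = text.split()
        if parts and "-" in parts[0]:  # "14-16 Nov. 2006" -> "16 Nov. 2006"
            text = parts[0].split("-")[-1] + " " + " ".join(parts[1:])
        for fmt in ("%d %B %Y", "%d %b. %Y", "%d %b %Y"):
            try:
                return datetime.datetime.strptime(text, fmt)
            except ValueError:
                pass
        return None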
    def download_a_paper(self, driver, path="../../data/tmp/"):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        initial_url = driver.current_url
        button = driver.find_element_by_css_selector(
            'i[class="icon doc-act-icon-pdf"]')  # closing quote was missing
        button.click()
        self.save_current_page(driver,
                               "./samples/sample_page_7849067_pdf_click.html")
        self.save_current_page(driver,
                               "./samples/sample_page_7849067_pdf_click.png")
        driver.get(initial_url)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def download_papers_by_keywords(self, driver, path, download_num=25):
        # 0: desktop, 1: system default download folder, 2: user-defined folder
        #driver.setPreference("browser.download.folderList", 2)
        # since 2 is selected above, specify the download directory
        #driver.setPreference("browser.download.dir", path)
        # boolean: whether to show the download manager window when a download starts
        #driver.setPreference("browser.download.manager.showWhenStarting", False)
        links = driver.find_elements_by_class_name("pure-u-22-24")
        self.log.debug("len(links)[" + str(len(links)) + "]")
        i = 0
        for link in links:
            self.log.debug("txt:" + link.text)
            element = link.find_element_by_css_selector("h2 > a")
            pdf_title = element.text
            self.log.debug("pdf_title:" + pdf_title)
            pdf_url = self.convert_path_to_url(element.get_attribute("href"))
            self.log.debug("pdf_dir:" + pdf_url)
            pdf_authors = link.find_element_by_css_selector("p").text.split("; ")
            self.log.debug("pdf_author:" + str(pdf_authors))
            print("pdf_title:" + pdf_title)
            print("pdf_dir:" + pdf_url)
            print("pdf_author:" + str(pdf_authors))
            i += 1
            if i >= download_num:
                self.log.debug("i >= " + str(download_num) + ". " +
                               __class__.__name__ + "." +
                               sys._getframe().f_code.co_name + " finished.")
                return 0
        self.log.debug("len(links) < " + str(download_num) + ". " +
                       __class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished.")
        return 0

    """
    def get_papers_with_breadth_first_search(self, root_url_of_paper):
        import math
        math.breadth_first_search(root_url_of_paper, get_citing_papers())
        self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " start")
        self.log.debug("root_url_of_paper[" + root_url_of_paper + "]")
        citing_urls, cited_urls = ***
        for url in citing_urls:
            self.get_papers_with_breadth_first_search(url)
        self.log.debug(__class__.__name__ + "." + sys._getframe().f_code.co_name + " finished")
    """

    def convert_to_datetime(self, src):
        # parameter renamed from `str` to avoid shadowing the builtin
        self.log.warning("!!!incomplete method[" + __class__.__name__ + "." +
                         sys._getframe().f_code.co_name + "]!!!")
        import time
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S')
        return timestamp

    def convert_paper_url_to_cited_url(self, url):
        #from
        #http://ieeexplore.ieee.org/document/4116687/?reload=true
        #to
        #http://ieeexplore.ieee.org/document/4116687/citations?anchor=anchor-paper-citations-ieee&ctx=citations
        #NOTE: assumes the path part ends with a trailing slash, as above.
        return url.split("?")[
            0] + "citations?anchor=anchor-paper-citations-ieee&ctx=citations"
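    ## A minimal self-check sketch for parse_citing (below), using the example
    ## string from its own comments; meant to be run by hand, not part of the
    ## scraper flow.
    def test_parse_citing(self):
        src = ('Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the '
               'Large-Scale Web Application Community", Advanced Information '
               'Networking and Applications Workshops (WAINA) 2013 27th '
               'International Conference on, pp. 185-190, 2013.')
        authors, title, conference, year = self.parse_citing(src)
        print("authors[" + authors + "]")
        print("title[" + title + "]")
        print("conference[" + conference + "]")
        print("year[" + year + "]")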
    def parse_citing(self, src):
        # parameter renamed from `str`: shadowing the builtin broke the
        # str(len(...)) calls below
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.log.debug("src_str[" + src + "]")
        #from
        #Daniel Garant, Wei Lu, "Mining Botnet Behaviors on the Large-Scale Web Application Community", Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on, pp. 185-190, 2013.
        #to
        #Daniel Garant, Wei Lu,
        #Mining Botnet Behaviors on the Large-Scale Web Application Community
        #Advanced Information Networking and Applications Workshops (WAINA) 2013 27th International Conference on
        #pp. 185-190, 2013
        array = src.split("\"")
        if len(array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("src[" + src + "]")
            self.log.warning("len(array)(" + str(len(array)) +
                             ") < 3. return \"\", \"\", \"\", \"\"")
            return "", "", "", ""
        authors = array[0]
        title = array[1]
        new_array = array[2][1:].split(",")
        print(len(new_array))
        if len(new_array) < 3:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("src[" + src + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") < 3. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        elif len(new_array) == 3:
            conference, page, year = new_array
        elif len(new_array) == 4:
            conference, vol, page, year = new_array
        elif len(new_array) == 5:
            conference, vol, page, year, issn = new_array
        else:
            self.log.warning(__class__.__name__ + "." +
                             sys._getframe().f_code.co_name + " warning")
            self.log.warning("src[" + src + "]")
            self.log.warning("len(new_array)(" + str(len(new_array)) +
                             ") > 5. return authors, title, \"\", \"\"")
            return authors, title, "", ""
        return authors, title, conference, year

    ## for debug
    def print_h2_attributes(self, driver):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        links = driver.find_elements_by_tag_name("h2")
        for link in links:
            print(link.text)
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def save_current_page(self, driver, filename):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        path, suffix = os.path.splitext(filename)
        self.log.debug("path[" + path + "], suffix[" + suffix + "]")
        if suffix == ".html":
            with open(filename, 'w') as f:
                f.write(driver.page_source)
        elif suffix == ".png":
            driver.save_screenshot(filename)
        else:
            self.log.error("TYPEERROR suffix[" + suffix + "]")
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")

    def show_options(self):
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " start")
        self.opt.show_options()
        self.log.debug(__class__.__name__ + "." +
                       sys._getframe().f_code.co_name + " finished")
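    ## A sketch of the wait idiom used throughout this class, expressed with
    ## Selenium's expected_conditions module instead of a bare lambda; assumes
    ## a Selenium version that ships selenium.webdriver.support. Not called by
    ## the methods above.
    def wait_for_css(self, driver, css, timeout=30):
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        try:
            # returns the first matching element once it is present in the DOM
            return WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, css)))
        except TimeoutException:
            self.log.warning("timed out waiting for css[" + css + "]")
            return None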
    def test_var(self):
        from conf import Conf
        print("IEEE_website[" + Conf.getconf("IEEE_website") + "]")
        print("IEEE_top_page[" + Conf.getconf("IEEE_top_page") + "]")
        paper_url = Conf.getconf("IEEE_website") + "/document/6550394"
        print("paper_url[" + paper_url + "]")
    def test_conf(self):
        from conf import Conf
        print("loglevel[" + Conf.getconf("loglevel") + "]")
import datetime
import os
import re
import sys

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../lib/utils")
from conf import Conf
from log import Log

sys.path.append(
    os.path.dirname(os.path.abspath(__file__)) + "/../src/scraping")
from kakaku import Kakaku

log = Log.getLogger()
kakaku = Kakaku()
args = sys.argv[1:]
print("products: " + str(args))
log_dir = Conf.getconf("product_log_dir")
for arg in args:
    log.info("target product name[" + str(arg) + "]")
    product_name = arg
    log_name = str(datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + "_" +
                   re.sub(" |/", "_", product_name) + ".log")
    #log_name = "product.log"
    #product_log = Log.getLogger(logfile=log_dir + log_name)
    product_log = log
    print("product: " + product_name + ", save log to: " + log_dir + log_name)
    try:
        kakaku.save_cheapest_pdf(product_name, logger=product_log)
    except Exception as e:
        print("Failed. caught " + e.__class__.__name__ +
              " exception. Please retry [" + product_name + "]")
        print(e)
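## A minimal retry sketch: the loop above only reports a failure and moves on
## to the next product. If retries are wanted, save_cheapest_pdf could be
## wrapped like this (the attempt count is illustrative; this helper is not
## used above):
def retry_call(func, *args, attempts=3, **kwargs):
    """Call func up to `attempts` times, returning on the first success."""
    last = None
    for i in range(attempts):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            last = e
            print("attempt " + str(i + 1) + "/" + str(attempts) +
                  " failed: " + e.__class__.__name__)
    raise last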