Example #1
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def fill_form(driver, search_argument):
    # Choose the one-way trip option
    driver.find_element_by_class_name('one-way-option').click()
    print('success click one way')

    # Origin airport: value 11=PB, 12=SRG, 13=LMB, 8=GA, 10=GT, 9=TK; worth separating into a class
    select_from = Select(driver.find_element_by_name('destination_from'))
    print('success click destination from')
    select_from.select_by_value('12')  # worth taking this from an input argument
    print('success selecting destination value')

    # Destination airport; worth separating into a class
    select_to = Select(driver.find_element_by_name('destination_to'))
    print('success selecting destination to')
    select_to.select_by_value('10')  # worth taking this from an input argument
    print('success selecting destination value')

    # Departure date; perhaps make a class so that it is easily customisable
    driver.find_element_by_name('depart_date').click()
    print('success choosing depart date')

    # Wait until the datepicker calendar is available
    WebDriverWait(driver, timeout=3).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'ui-datepicker-calendar')))

    # Pick a day from the datepicker; worth a for loop or dependency injection over
    # class='ui-datepicker-month', class='ui-datepicker-year' and the day cells, e.g.
    # <td data-handler="selectDay" data-month="7" data-year="2019"><a class="ui-state-default">5</a></td>,
    # so that input[name=depart_date] ends up holding a value such as '20 November 2019'
    calendar = driver.find_element_by_class_name('ui-datepicker-calendar')
    calendar.find_element_by_link_text('5').click()

    # Number of adult passengers; Select has no select_by_title(), so pick by visible text
    # (this assumes 'select2-adult-4a-container' wraps an actual <select> element)
    search_field_adult = Select(driver.find_element_by_id('select2-adult-4a-container'))
    search_field_adult.select_by_visible_text('2')

    # Find the search button and click it
    driver.find_element_by_class_name('search-avaibility').click()
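The "worth separating into a class" notes above could be taken in this direction; a minimal sketch, reusing the value mapping from the comment (the DestinationSelector name and its interface are illustrative, not part of the original script):

from selenium.webdriver.support.ui import Select


class DestinationSelector:
    # Value mapping taken from the comment above: 11=PB, 12=SRG, 13=LMB, 8=GA, 10=GT, 9=TK
    CODES = {'PB': '11', 'SRG': '12', 'LMB': '13', 'GA': '8', 'GT': '10', 'TK': '9'}

    def __init__(self, driver):
        self.driver = driver

    def choose(self, field_name, airport_code):
        # field_name is 'destination_from' or 'destination_to'
        select = Select(self.driver.find_element_by_name(field_name))
        select.select_by_value(self.CODES[airport_code])


# usage inside fill_form:
# DestinationSelector(driver).choose('destination_from', 'SRG')
# DestinationSelector(driver).choose('destination_to', 'GT')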
Example #2
    def wait_for_presence(self, selector='', **kwargs):
        '''
        Wait for an element to be present. (Does not need to be visible.)

        Parameters
        ----------
        selector: str
            A CSS selector to search for. This can be any valid CSS selector.

        kwargs:
            Passed on to _wait_for

        '''
        self._wait_for(EC.presence_of_element_located((By.CSS_SELECTOR, selector)),
                       **kwargs)
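The _wait_for helper that this method delegates to is not shown above; a minimal sketch of what it might look like, assuming it simply wraps WebDriverWait around self.driver (the timeout keyword and its default value are assumptions):

    def _wait_for(self, condition, timeout=10):
        # Hypothetical helper: block until `condition` holds or `timeout` seconds elapse
        return WebDriverWait(self.driver, timeout).until(condition)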
Example #3
    def process_request(self, request, spider):
        self.logger.debug('PhantomJS is Starting')
        page = request.meta.get('page', 1)
        try:
            self.browser.get(request.url)
            if page > 1:
                # Jump to the requested results page via the pager form
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input.clear()
                input.send_keys(page)
                submit.click()
            # Wait until the active page indicator shows the requested page number
            self.wait.until(
                EC.text_to_be_present_in_element((
                    By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
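For this process_request to run, the surrounding downloader middleware still has to be enabled in the Scrapy project settings; a minimal sketch, where the module path and class name are placeholders rather than anything given above:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # hypothetical path to the class that defines process_request above
    'myproject.middlewares.SeleniumMiddleware': 543,
}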
Example #4
    def find_elements(self, locator, timeout=10):
        "Locate a group of elements; pass a locator tuple, e.g. (By.CLASS_NAME, 'one')"
        elements = WebDriverWait(self.driver, timeout, 1).until(
            EC.presence_of_all_elements_located(locator))
        return elements
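A brief usage sketch for find_elements, where page stands for an instance of the surrounding class and the locator is purely illustrative:

from selenium.webdriver.common.by import By

# collect every element carrying the class used in the docstring example
items = page.find_elements((By.CLASS_NAME, 'one'), timeout=5)
print('found %d elements' % len(items))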
Example #5
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.com")
    input = browser.find_element_by_id('kw')
    input.send_keys('python')
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()
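Note that expected_conditions defines no presence_of_elements_located; the two conditions that actually exist are shown below (a minimal reminder, with browser as in the example above and the CSS selector purely illustrative):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 10)
# resolves to a single WebElement as soon as one match is in the DOM
element = wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
# resolves to a list of WebElements once at least one match is in the DOM
elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.result')))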
Example #6
    def article_title(self, pages, related):
        # Scrape article titles from each configured journal page and save them as CSV
        for page_id in pages:

            if page_id == "jes":
                # print("in jes")
                page_content = requests.get(pages[page_id]).content
                self.soup = BeautifulSoup(page_content, 'html.parser')
                lists = [
                    i for i in self.soup.find("form", attrs={
                        "action": "/gca"
                    }).findAll("div")
                    if i.attrs["class"].count("toc-level") > 0
                ]
                titles_jes = []
                titles_jes_html = []
                for i in lists:
                    data_extracted = {}

                    lists2 = [
                        j for j in i.findAll("div")
                        if j.attrs["class"].count("toc-level") > 0
                    ]

                    if lists2:
                        if i.find("h2"):
                            data_extracted.update({i.find("h2").text: {}})
                        # count_q = 0
                        for q in lists2:
                            if q.find("h3"):
                                data_extracted[i.find("h2").text].update(
                                    {q.find("h3").text: []})
                                for l in q.findAll(
                                        "h4",
                                        attrs={"class": "cit-title-group"}):
                                    data_extracted[i.find("h2").text][q.find(
                                        "h3").text].append(
                                            l.text.strip().replace("\n", ""))
                                    titles_jes.append(l.text.strip().replace(
                                        "\n", ""))
                                    titles_jes_html.append(l)
                    else:
                        # print("in h2 jes")
                        if i.find("h2"):
                            data_extracted.update({i.find("h2").text: []})
                            for l in i.findAll(
                                    "h4", attrs={"class": "cit-title-group"}):
                                data_extracted[i.find("h2").text].append(
                                    l.text.strip().replace("\n", ""))
                                titles_jes.append(l.text.strip().replace(
                                    "\n", ""))
                                titles_jes_html.append(l)
                    # data.update(data_extracted)
                # print(data)
                self.save_as_csv(titles_jes, pages[page_id], related)
                self.save_as_csv(titles_jes_html,
                                 pages[page_id],
                                 related,
                                 other=True)
                # print(titles_jes)
                # print("end of jes")

            elif page_id == "iopscience":
                # print("in iopscience")
                page_content = requests.get(pages[page_id]).content
                self.soup = BeautifulSoup(page_content, 'html.parser')
                lists_article_title = [
                    i.text.strip() for i in self.soup.findAll(
                        "a", attrs={"class": "art-list-item-title"})
                ]
                lists_article_title_html = [
                    i for i in self.soup.findAll(
                        "a", attrs={"class": "art-list-item-title"})
                ]
                # lists_authors = [i.text.strip() for i in self.soup.findAll("p",attrs={"class","small art-list-item-meta"})]
                # abstract_text = [i.find("p").text for i in self.soup.findAll("div",attrs={"class","article-text wd-jnl-art-abstract cf"})]
                # pdfs_link = [i.findAll("a")[2].attrs["href"] for i in self.soup.findAll("div",attrs={"class","art-list-item-tools small"}) if i.findAll("a",attrs={"class","mr-2 nowrap"}) ]
                # oa_or_not = []
                # for i in self.soup.findAll("div",attrs={"class","eyebrow"}):
                #     if i.findAll("a",attrs={"class","mr-2 nowrap"}):
                #         oa_or_not.append({"OA":True})
                #     else:
                #         oa_or_not.append({"OA":False})
                # print(oa_or_not)
                # print(lists_article_title)
                self.save_as_csv(lists_article_title, pages[page_id], related)
                self.save_as_csv(lists_article_title_html,
                                 pages[page_id],
                                 related,
                                 other=True)
                # print("end of iopscience")

            elif page_id == "scrip":
                # driver_scrip = webdriver.Chrome(executable_path="/Users/dhaneesh.gk/Projects/own/web_import/extract_it/drivers/chromedriver")
                driver_scrip = self.chrome_driver
                driver_scrip.get(pages[page_id])
                time.sleep(5)
                driver_scrip.refresh()
                WebDriverWait(driver_scrip, 20).until(
                    EC.presence_of_all_elements_located((
                        By.XPATH,
                        "//ul[div[contains(@id,'JournalInfor_Repeater_Papers')]]/p/a[@name]")),
                    "DOM content is not accessible right now")
                article_titles_scrip = [
                    i.text.strip()
                    for i in driver_scrip.find_elements_by_xpath(
                        "//ul[div[contains(@id,'JournalInfor_Repeater_Papers')]]/p/a[@name]"
                    )
                ]
                if not article_titles_scrip:
                    article_titles_scrip.append(
                        "Titles are not accessible from the website right now")
                self.save_as_csv(article_titles_scrip, pages[page_id], related)

            elif page_id == "sciencedirect":
                page_content = requests.get(pages[page_id]).content
                self.soup = BeautifulSoup(page_content, 'html.parser')
                article_titels = []
                article_titles_html = []
                for i in self.soup.findAll(
                        "h3", attrs={"class": "text-m u-display-inline"}):
                    for j in i.findAll("span"):
                        if j.attrs:
                            if article_titels.count(j.text) == 0:
                                article_titels.append(j.text)
                                article_titles_html.append(j)
                # print(article_titels)
                self.save_as_csv(article_titels, pages[page_id], related)
                self.save_as_csv(article_titles_html,
                                 pages[page_id],
                                 related,
                                 other=True)

            elif page_id == "jsac":
                page_content = requests.get(pages[page_id]).content
                self.soup = BeautifulSoup(page_content, 'html.parser')
                articles = []
                article_titels = []
                article_titles_html = []
                for i in self.soup.findAll("div", attrs={"class": "article"}):
                    data_extracted = {}
                    title = i.find("div", attrs={"class": "title"}).text
                    title_html = i.find("div", attrs={"class": "title"})
                    authors = i.find("div", attrs={"class": "author"}).text
                    journal = i.find("div", attrs={"class": "journal"}).text
                    links = {
                        j.text: "http://www.jsac.or.jp" + j.attrs["href"]
                        for j in i.findAll("a") if "href" in j.attrs
                    }
                    image = "http://www.jsac.or.jp" + i.find(
                        "img").attrs["src"]
                    data_extracted.update({
                        "title": title,
                        "authors": authors,
                        "journal": journal,
                        "links": links,
                        "image": image
                    })
                    article_titels.append(title)
                    article_titles_html.append(title_html)
                    articles.append(data_extracted)
                # print(article_titels)
                self.save_as_csv(article_titels, pages[page_id], related)
                self.save_as_csv(article_titles_html,
                                 pages[page_id],
                                 related,
                                 other=True)
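The save_as_csv helper called throughout article_title is not shown above; a minimal sketch of what it might do, assuming one row per title and a filename derived from the page URL (the filename scheme, the CSV layout, and the handling of other= are assumptions, not the original implementation):

    def save_as_csv(self, items, page_url, related, other=False):
        # Hypothetical implementation: write one item per row; other=True marks the
        # variant that receives raw HTML elements instead of plain title strings
        import csv
        suffix = '_html' if other else ''
        filename = page_url.replace('://', '_').replace('/', '_') + suffix + '.csv'
        with open(filename, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['title', 'related'])
            for item in items:
                writer.writerow([str(item), related])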