Пример #1
0
def crawl():
    """Crawl the Chosun Ilbo front page and post every article found.

    Collects article links matching the known URL pattern, extracts each
    one with ``extract_news`` and posts it via ``util.post``.  Transient
    WebDriver failures are retried up to ``MAX_RETRY`` times per link.
    """
    PATIENCE = 15          # seconds to wait for page loads / elements
    MAX_RETRY = 3          # attempts per article link
    SOURCE_NAME = "조선일보"
    MAIN_URL = "http://www.chosun.com/"

    driver = util.get_driver()
    driver.get(MAIN_URL)
    driver.set_page_load_timeout(PATIENCE)
    article_link_patterns = ["news.chosun.com/site/data/html_dir/"]
    link_list = []
    timeout_cnt = 0
    skipped_cnt = 0

    # Wait for the section containers, then gather every href inside the
    # second one plus the "today live" links.
    href_elms = WebDriverWait(driver, PATIENCE) \
        .until(EC.presence_of_all_elements_located((By.CLASS_NAME, "sec_con")))[1] \
        .find_elements_by_css_selector("[href]")
    live_elms = WebDriverWait(driver, PATIENCE) \
        .until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#today_live_con_id [href]")))
    href_elms += live_elms

    for elm in href_elms:
        href = elm.get_attribute("href")
        for pattern in article_link_patterns:
            if pattern in href:
                # Membership test instead of index()/except ValueError.
                if href not in link_list:
                    link_list.append(href)
                break

    print("%d articles found" % len(link_list))

    for link in link_list:
        # Retry loop: was hard-coded range(0, 3); use MAX_RETRY so the
        # constant actually governs the number of attempts.
        for retry in range(MAX_RETRY):
            try:
                article = extract_news(driver, link)
                if article is not None:
                    util.post(article, SOURCE_NAME)
                    break
                else:
                    continue  # extraction yielded nothing; try again
            except (TimeoutException, NoSuchElementException,
                    StaleElementReferenceException):
                if retry == MAX_RETRY - 1:
                    skipped_cnt += 1  # out of retries: give up on this link
                else:
                    driver.refresh()
                    timeout_cnt += 1

    driver.quit()

    print("Done with %d timeouts and %d skipped pages in %d links" %
          (timeout_cnt, skipped_cnt, len(link_list)))
Пример #2
0
def crawl():
    """Crawl the Dong-A Ilbo front page and post every article found.

    Scrapes all links on the main page, keeps those that point at a known
    article section, then extracts and posts each one, retrying transient
    WebDriver failures up to MAX_RETRY times per link.
    """
    PATIENCE = 15
    MAX_RETRY = 3
    SOURCE_NAME = "동아일보"
    MAIN_URL = "http://www.donga.com/"

    driver = util.get_driver()
    driver.set_page_load_timeout(PATIENCE)
    driver.get(MAIN_URL)

    INCLUDE_URLS = ["news.donga.com/Main", "news.donga.com/MainTop"]
    timeout_cnt = 0
    skipped_cnt = 0

    # Keep only hrefs that point at one of the known article sections.
    article_links = [
        href
        for href in (elm.get_attribute("href")
                     for elm in driver.find_elements_by_css_selector("[href]"))
        if any(pattern in href for pattern in INCLUDE_URLS)
    ]

    print("%d articles found" % len(article_links))

    for link in article_links:
        for attempt in range(0, MAX_RETRY):
            try:
                article = extract(driver, link)
                util.post(article, SOURCE_NAME)
                break
            except (TimeoutException, NoSuchElementException,
                    StaleElementReferenceException):
                if attempt < MAX_RETRY - 1:
                    driver.refresh()
                    timeout_cnt += 1
                else:
                    skipped_cnt += 1  # out of retries: give up on this link

    driver.quit()

    print("Done with %d timeouts and %d skipped pages in %d links" %
          (timeout_cnt, skipped_cnt, len(article_links)))
Пример #3
0
 def xmlrpc_get_driver(self, iface):
     """
     Return the result of ``util.get_driver(iface)`` for *iface*.

     NOTE(review): the original docstring said "Returns driver version";
     confirm what ``util.get_driver`` actually returns for an interface.
     """
     return util.get_driver(iface)
Пример #4
0
from time import sleep

from util import get_driver

driver = get_driver()

# Query the device window size; all swipe coordinates below are
# expressed as fractions of it.
window = driver.get_window_size()
print('分辨率:', window)

width, height = window.get('width'), window.get('height')

# Vertical swipe: from 80% down the screen up to 20%, over 2000 ms.
driver.swipe(width * 0.5, height * 0.8, width * 0.5, height * 0.2, 2000)

sleep(3)
driver.close_app()
# Horizontal swipe: from 80% across the screen to 20%, over 2000 ms.
# NOTE(review): this runs after close_app() — confirm it is intended to
# act on whatever screen is shown once the app is closed.
driver.swipe(width * 0.8, height * 0.5, width * 0.2, height * 0.5, 2000)

sleep(3)
driver.quit()
Пример #5
0
 def xmlrpc_get_driver(self, iface):
     """
     Return the result of ``util.get_driver(iface)`` for *iface*.

     NOTE(review): the original docstring said "Returns driver version";
     confirm what ``util.get_driver`` actually returns for an interface.
     """
     return util.get_driver(iface)
Пример #6
0
 def setup_class(self):
     # Create the WebDriver instance shared by the tests in this class.
     self.driver = get_driver()
Пример #7
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:

from util import get_driver, read_lines, write_lines, dump_pickle, load_pickle
from pprint import pprint
from tqdm import tqdm
from collections import OrderedDict
import os

# In[2]:

driver = get_driver(browser='chrome', headless=True)

# In[3]:

# Build an ordered word -> metadata mapping from roots.txt; each line's
# second token is the word, the rest is kept as its metadata.
entries = read_lines('roots.txt')
vocab = OrderedDict()
for entry in tqdm(entries):
    first, word, *rest = entry.split()
    # Keep only the first occurrence of each word; later entries are the
    # same word under a different POS tag and are skipped.
    if word not in vocab:
        vocab[word] = [first] + rest

# In[4]:


def get_num_related(word, verbose=False):
    driver.get(f'https://www.merriam-webster.com/thesaurus/{word}')
Пример #8
0
def start_listener():
    """Main Instagram bot loop.

    Repeatedly logs in, polls direct messages, replies to photo/video/post
    shares and "!"-prefixed commands, logs every activity to disk, and
    periodically runs the follow routine — until the module-level
    ``service_on`` flag goes false.
    """
    global activities

    backup_reload(False)

    while service_on:
        with util.get_driver(True) as driver:
            insta_login(driver)

            # x counts down polls until the next do_follows() run:
            # once immediately after login, then every 40 iterations.
            x = 0
            while service_on and x < 500:
                if x == 0:
                    do_follows(driver)
                    x = 40
                else:
                    x = x - 1

                messages = load_messages(driver)

                for user, msgs in messages.items():
                    for msg in msgs:
                        # Each message is expected as a [kind, payload] pair;
                        # anything else is silently ignored.
                        if isinstance(msg, list) and len(msg) == 2:

                            activity_type = "sent a Text '{0}'".format(msg[1])
                            if msg[0] == "Image":
                                # Photos are acknowledged but not processed.
                                activity_type = "shared a photo. URL '{0}'".format(
                                    msg[1])
                                send_message(driver, user, [
                                    ":robot_face: {0} - Hübsches Foto :heart_eyes:, aber damit kann ich im Moment nichts anfangen :confounded:"
                                    .format(config.BOT_NAME)
                                ])
                            if msg[0] == "Video":
                                # Videos are acknowledged but not processed.
                                activity_type = "shared a video. URL '{0}'".format(
                                    msg[1])
                                send_message(driver, user, [
                                    ":robot_face: {0} - Super Video :ok_hand:, aber damit kann ich im Moment nichts anfangen :confounded:"
                                    .format(config.BOT_NAME)
                                ])
                            if msg[0] == "Post":
                                # Shared posts/stories: reply with the media URLs.
                                activity_type = "shared a Post or Story. URLs: '{0}'".format(
                                    msg[1])
                                if msg[1]:
                                    send_message(driver, user, [
                                        ":arrow_down: Post Downloader - hier sind die Links zu den Medien:"
                                    ])
                                    for url in msg[1]:
                                        send_message(driver, user, [url])
                                    send_message(driver, user, [
                                        ":muscle: powered by: @{0} :sunglasses:"
                                        .format(config.INSTA_USER)
                                    ])
                                else:
                                    send_message(driver, user, [
                                        ":warning: Post Downloader - keine Medien gefunden!"
                                    ])
                            if msg[0] == "Text":
                                # Texts starting with "!" are bot commands.
                                if msg[1].startswith("!"):
                                    cmd = msg[1].split(" ")

                                    activity_type = "issued Command: '{0}' with Args: '{1}'".format(
                                        cmd[0], " ".join(cmd[1:]))
                                    process_command(driver, user, cmd[0],
                                                    cmd[1:])

                            log_activity("[{0}] '{1}' {2}".format(
                                time.ctime(time.time()), user, activity_type))

                # Flush collected activity records to disk, then reset.
                util.append_list(activities,
                                 os.path.join('data', 'activities_log'))
                activities = []

                # Randomized delay between polls.
                time.sleep(random.uniform(5, 10))
            backup_reload(True)
Пример #9
0
from tqdm import tqdm
import json
import os
from util import get_driver, read_lines, dump_pickle

# Collins dictionary base URL; the target word is appended to it.
URL = "https://www.collinsdictionary.com/us/dictionary/english/"
# XPaths for the fields scraped from each dictionary entry page.
XPATH = {
    "pos": '//span[@class="gramGrp pos"]',
    "meaning": '//div[@class="def"]',
    "sent": '//div[@class="cit type-example quote"]',
}

# Each line starts with: word, frequency, usage score (see the loop below).
lines = read_lines('sorted_importance.txt')

driver = get_driver(headless=True)
# Resume from an earlier run if a partial dictionary exists on disk.
if os.path.exists('book.json'):
    with open('book.json') as fp:
        dictionary = json.load(fp)
else:
    dictionary = {}

for i, line in enumerate(tqdm(lines)):
    word, freq, use = line.split()[:3]
    if word not in dictionary:
        dictionary[word] = {
            'freq': freq,
            'use': use,
        }
        driver.get(URL + word.lower())
        for key, xpath in XPATH.items():
            try:
Пример #10
0
def crawl():
    """Crawl the JoongAng Ilbo front page and post every article found.

    Loads the main page (retrying on timeouts), collects article links,
    strips query strings and de-duplicates them, then extracts and posts
    each article with up to MAX_RETRY attempts per link.  Exits the
    process with status 1 if the main page never loads.
    """
    PATIENCE = 15      # page-load timeout (s) and max main-page load attempts
    MAX_RETRY = 3      # attempts per article link
    SOURCE_NAME = "중앙일보"
    MAIN_URL = "https://joongang.joins.com/"
    INCLUDE_URLS = [
        "news.joins.com/article"
    ]
    inclusion_filtered = []
    link_list = []
    timeout_cnt = 0
    skipped_cnt = 0

    driver = util.get_driver()
    driver.set_page_load_timeout(PATIENCE)

    done = False
    for r in range(0, PATIENCE):
        try:
            driver.get(MAIN_URL)
            href_elms = WebDriverWait(driver, PATIENCE) \
                .until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[href]")))
            # Mark success only after the elements are located, so
            # href_elms is guaranteed to be bound below (previously
            # done was set before the wait, which could leave done=True
            # with href_elms unbound if the wait timed out).
            done = True
            break  # previously missing: without it the page reloaded up to PATIENCE times
        except TimeoutException:
            continue

    if not done:
        driver.quit()
        sys.exit(1)

    for elm in href_elms:
        href = elm.get_attribute("href")

        for pattern in INCLUDE_URLS:
            if pattern in href:
                inclusion_filtered.append(href)
                break

    # Link clean-up: drop query strings and de-duplicate.
    for href in inclusion_filtered:

        if "?" in href:
            href = href.split("?")[0]

        # Membership test instead of index()/except ValueError.
        if href is not None and href not in link_list:
            link_list.append(href)

    print("%d articles found" % len(link_list))

    for link in link_list:
        for retry in range(0, MAX_RETRY):
            try:
                article = extract(driver=driver, url=link)
                if article is not None:
                    util.post(article, SOURCE_NAME)
                break
            except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                if retry == MAX_RETRY - 1:
                    skipped_cnt += 1  # out of retries: give up on this link
                else:
                    try:
                        driver.refresh()
                    except TimeoutException:
                        # The refresh itself timed out as well.
                        skipped_cnt += 1
                        continue
                    timeout_cnt += 1

    driver.quit()
    print("Done with %d timeouts and %d skipped pages in %d links" % (timeout_cnt, skipped_cnt, len(link_list)))