Example #1
def crawl_worker(agent_cfg, url_tuple):
    """Crawl given url. Will work in parallel. Cannot be class method."""
    MAX_SLEEP_BEFORE_JOB = 10 # prevent starting all parallel processes at the same instance
    sleep(random() * MAX_SLEEP_BEFORE_JOB) # sleep for a while
    
    try:
        idx, url = url_tuple
        idx = str(idx)
        
        stdout_log = os.path.join(agent_cfg['job_dir'], fu.get_out_filename_from_url(url, idx, '.txt'))
       
        if not url.startswith(('data:', 'http:', 'https:', 'file:')):  # prepend a scheme if none is given
            url = 'http://' + url
        
        proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'], agent_cfg['mitm_proxy_logs']) if agent_cfg['use_mitm_proxy'] else ""
        
        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd) # Run the command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s' % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
            
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
            
        sleep(2) # this will make sure mitmdump is timed out before we start to process the network dump
        if agent_cfg['post_visit_func']: # this pluggable function will parse the logs and do whatever we want
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
            
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
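
crawl_worker is kept at module level because it is meant to be handed to separate worker processes, which cannot pickle bound methods. A minimal sketch of such a dispatch with multiprocessing (the helper names, pool size, and URL list handling are assumptions added for illustration, not part of the original code):

from multiprocessing import Pool

def _crawl_worker_star(args):
    """Unpack an (agent_cfg, url_tuple) pair for Pool.map, which passes a single argument."""
    return crawl_worker(*args)

def crawl_in_parallel(agent_cfg, urls, num_procs=4):
    """Fan URL visits out to a pool of worker processes (illustrative sketch)."""
    jobs = [(agent_cfg, (idx, url)) for idx, url in enumerate(urls)]
    pool = Pool(processes=num_procs)
    pool.map(_crawl_worker_star, jobs)
    pool.close()
    pool.join()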
Example #2
    def test_should_timeout_onbeforeunload(self):
        test_url = cm.BASE_TEST_URL + "crawler/onbeforeunload.html"
        try:
            ut.timeout(cr.CRAWLER_CLICKER_VISIT_TIMEOUT + 2)
            cr.crawl_url("chrome_clicker", test_url, "")
        except ut.TimeExceededError as texc:
            self.fail("Crawl has timed out %s" % texc)
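
The test relies on ut.timeout arming a watchdog that raises ut.TimeExceededError if the visit hangs on the onbeforeunload dialog. A minimal signal-based sketch of what such a helper could look like (an assumption about its shape, not the project's actual implementation):

import signal

class TimeExceededError(Exception):
    """Raised when the watchdog alarm fires."""
    pass

def _raise_time_exceeded(signum, frame):
    raise TimeExceededError("operation exceeded the allotted time")

def timeout(duration):
    """Arm a SIGALRM watchdog that raises TimeExceededError after `duration` seconds."""
    signal.signal(signal.SIGALRM, _raise_time_exceeded)
    signal.alarm(duration)

def cancel_timeout():
    """Disarm the watchdog, e.g. when the visit finishes in time."""
    signal.alarm(0)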
Example #3
    def __init__(self, html=' ', url=None, base_href=None, **extras):
        """
        init the class

        :param html: the html doc crawled
        :param url: if url param is set, the html param is ignored, and crawl the url page
        :param base_href: the base href settings in a page
        :param extras: extra settings here to add
        """
        if url is None:
            self.html = html
        else:
            crawled_page = crawl_url(url=url)
            self.html = crawled_page
        self.base_href = base_href
        self._parsed = build_html_tree(self.html, self.base_href)
        self.encode = get_encode_type(self.html)
        self.content_node = None
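
The class name is not shown in this excerpt. Assuming it is something like HtmlDoc (a hypothetical name) and that crawl_url, build_html_tree and get_encode_type behave as their names suggest, instantiation might look like this sketch:

# Hypothetical usage; the class name is an assumption made for illustration.
doc_from_html = HtmlDoc(html='<html><body><p>hello</p></body></html>')
doc_from_url = HtmlDoc(url='http://example.com/', base_href='http://example.com/')

print(doc_from_url.encode)  # encoding type detected from the fetched page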
Example #4
def grid_search(ra_0, ra_1, dec_0, dec_1, interval):
    ra_0, ra_1 = sorted([ra_0, ra_1])
    dec_0, dec_1 = sorted([dec_0, dec_1])

    ra_length = int((ra_1 - ra_0) // interval)
    dec_length = int((dec_1 - dec_0) // interval)

    datafilename = 'data/%g-%g-%g-%g-%g.csv' % (ra_0, ra_1, dec_0, dec_1,
                                                interval)

    if os.path.isfile(datafilename):
        print('File already exists.')
        return datafilename

    f = open(datafilename, 'w')
    f.write('ObjectNo, ra, dec, type, u, g, r, i, z\n')

    ObjectList = []
    num = ra_length * dec_length

    print('Started crawling %d spots...' % (num))
    for ra_i in range(ra_length):
        for dec_i in range(dec_length):
            ra = ra_0 + interval * ra_i
            dec = dec_0 + interval * dec_i
            data = crawl_url(ra, dec, 0.5)
            if data == 0:
                continue
            if data[3] == 'STAR' and data[0] not in ObjectList:  # avoid duplicates
                ObjectList.append(data[0])
                f.write(", ".join(data) + "\n")

            x = 1 + dec_i + ra_i * dec_length
            if x % 10 == 0:
                print('Crawled %d out of %d spots' % (x, num))

    f.close()
    print('Done!')

    return datafilename
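
A call such as the following (the coordinates and interval are illustrative values, not from the original) sweeps a 1 x 1 degree patch in 0.1 degree steps, i.e. 100 query spots, and returns the path of the CSV it wrote:

# Illustrative usage: scan RA 150-151 deg, Dec 2-3 deg on a 0.1 deg grid.
csv_path = grid_search(150.0, 151.0, 2.0, 3.0, 0.1)
print('Results written to %s' % csv_path)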
Example #5
    def work(self, should_continue):
        if not should_continue():
            return

        context = zmq.Context()
        with pull_socket(context, self._from_bind) as source, \
                 push_socket(context, self._to_bind) as destination:
            poller = zmq.Poller()
            poller.register(source, zmq.POLLIN)
            poller.register(destination, zmq.POLLOUT)

            while should_continue():
                url, data = None, None
                presults = dict(poller.poll(timeout=10))
                if len(presults) == 2:
                    try:
                        url, data = source.recv_multipart(zmq.NOBLOCK)
                        self.log.info([url, data])
                    except zmq.ZMQError as ze:
                        self.log.error("something bad happened maybe\n\n %s" % str(ze))

                    if url and data:
                        response = None
                        try:
                            data = json.loads(data)
                            response = crawl_url(url=url,
                                                 etag=data["etag"],
                                                 last_modified=data["last_modified"])
                            if response:
                                self.log.info("got response for %s" % url)
                        except Exception:
                            self.log.error("could not crawl %s" % url)

                        if response:
                            try:
                                self.log.info("sending to destination....")
                                destination.send_multipart([url, str(response)])
                            except zmq.ZMQError as ze:
                                self.log.error("could not send result of crawl of %s \n\n %s" % (url, str(ze)))
                    self.log.info("did something...")
                    self.log.info([presults, source in presults, destination in presults])
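
The work loop assumes pull_socket and push_socket context managers that open ZeroMQ sockets on the given addresses and close them when the block exits. Their definitions are not shown; a minimal sketch of what they might look like (an assumption, including the choice to bind, which the attribute names _from_bind and _to_bind suggest):

from contextlib import contextmanager
import zmq

@contextmanager
def pull_socket(context, bind_addr):
    """Yield a bound PULL socket and close it when the block exits."""
    sock = context.socket(zmq.PULL)
    sock.bind(bind_addr)
    try:
        yield sock
    finally:
        sock.close()

@contextmanager
def push_socket(context, bind_addr):
    """Yield a bound PUSH socket and close it when the block exits."""
    sock = context.socket(zmq.PUSH)
    sock.bind(bind_addr)
    try:
        yield sock
    finally:
        sock.close()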
Example #6
from crawler import crawl_url, crawl_news

# Collect URLs from page 1 of the Finance (259) category for July 03, 2019
urlDatas = crawl_url(259, '20190701', 1)
print('Press outlet : ' + urlDatas[1]['press'])
print('url : ' + urlDatas[1]['url'])
url = urlDatas[3]['url']

newsData = crawl_news(url)

# News title
title = newsData['title']
print(title)
print("\n")

# Publication date
publishedAt = newsData['publishedAt']

# Thumbnail URL
thumbnail = newsData['thumbnail']

# News body
content = newsData['content']

# Naver News summary (beta)
summary = newsData['summary']

# Reactions: like, heartwarming (warm), sad, angry, want follow-up coverage
like = newsData['like']
warm = newsData['warm']
sad = newsData['sad']
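
The fields collected above can be bundled into a single record and persisted; a small sketch (the output file name and record layout are illustrative, not part of the original example):

import json

# Bundle the fields pulled from newsData into one record (illustrative structure).
record = {
    'title': title,
    'publishedAt': publishedAt,
    'thumbnail': thumbnail,
    'content': content,
    'summary': summary,
    'reactions': {'like': like, 'warm': warm, 'sad': sad},
}

with open('news_sample.json', 'w') as fp:
    json.dump(record, fp, ensure_ascii=False, indent=2)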
Example #7
categoryList = [
    264, 265, 268, 266, 267, 269, 259, 258, 261, 771, 260, 262, 310, 263, 249,
    250, 251, 254, 252, '59b', 255, 256, 276, 257, 241, 239, 240, 237, 238,
    376, 242, 243, 244, 248, 245, 231, 232, 233, 234, 322, 731, 226, 227, 230,
    732, 283, 229, 228
]

for category in categoryList:
    startDate = 20190601
    endDate = 20190602
    page = 1
    crawledUrl = []
    while True:
        if startDate == endDate:
            break
        urlDatas = crawl_url(category, str(startDate), page)

        for urlData in urlDatas:
            ID = str(startDate) + randomString()
            try:
                url = urlData['url']
                if url in crawledUrl:
                    page = 20
                    break
                else:
                    crawledUrl.append(url)
                    newsData = crawl_news(url)
                    title = newsData['title']
                    content = newsData['content']
                    like = newsData['like']
                    angry = newsData['angry']