Example #1
 def get_text_by_base_url(self):
     robots = RobotsCache(capacity=100)
     if not robots.allowed(self.base_url, "python-requests"):
         return ["Crawling this site is not allowed by robots.txt"]
     text_list = []
     for slug in self.__get_links_by_url_depth():
         sleep(0.5)
         text_list.append(
             remove_emoji(
                 remove_url(self.__get_text_by_url(self.base_url +
                                                   slug))).strip())
     return text_list
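
The examples in this listing all call a remove_emoji() helper whose implementation is not shown. As a rough, hypothetical sketch (the real helpers differ per project; examples #3, #6 and #8 apparently operate on Python 2 byte strings), a regex over the common emoji code-point blocks is one typical way to write it:

import re

# Hypothetical sketch only: strip characters from the most common emoji blocks.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\u2600-\u27BF"          # misc symbols and dingbats
    "]+",
    flags=re.UNICODE)


def remove_emoji(text):
    """Return text with characters in the ranges above removed."""
    return _EMOJI_PATTERN.sub('', text)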
Example #2
 def get_tweets(self):
     all_tweets = []
     try:
         for new_tweet in self.__limit_handled(
                 tweepy.Cursor(self.api.user_timeline,
                               screen_name=SCREEN_NAME,
                               include_rts=False,
                               exclude_replies=True).items()):
             all_tweets.append(new_tweet)
     except RuntimeError as e:  # RuntimeError: generator raised StopIteration
         print(e)
     print("Tweet Num {}".format(len(all_tweets)))
     return [remove_emoji(x.text) for x in all_tweets]
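
Example #2 depends on a private __limit_handled() wrapper that is not shown. A plausible shape for it, assuming the rate-limit pattern from the tweepy 3.x documentation, is sketched below; note that on Python 3.7+ a StopIteration escaping from cursor.next() inside such a generator surfaces to the caller as RuntimeError (PEP 479), which is what the except RuntimeError clause above catches:

import time

import tweepy


def limit_handled(cursor):
    """Yield items from a tweepy cursor, sleeping through rate-limit windows."""
    while True:
        try:
            yield cursor.next()
        except tweepy.RateLimitError:
            # back off for the 15-minute rate-limit window
            time.sleep(15 * 60)
        # a StopIteration raised by cursor.next() is not handled here, so on
        # Python 3.7+ it propagates to the caller as RuntimeError (see example #2)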
Example #3
def parse_company_page(content=None, proxy=None):
    logger = utils.get_logger()
    document = etree.HTML(content.decode('utf-8'))
    company_page_url = document.xpath('//p[@class="company-name-t"]/a/@href')
    if company_page_url:
        company_page_url = company_page_url[0]
        company_page = download_page(url=company_page_url,
                                     method='get',
                                     proxy=proxy)
        logger.info('parse_company_page fetched company page %s ' % company_page_url)
        if company_page['code'] == 0:
            try:
                company_page_ = utils.remove_emoji(
                    company_page['data']).decode('utf-8', 'ignore')
                return company_page_
            except Exception as e:
                logger.error('failed to decode company page %s: %s' % (company_page_url, e))
    else:
        logger.error('parse_company_page: no company detail page url found')
    return ''
Example #4
def group_tiplines_by_language(
        tip_line_requests,
        languages=['en', 'pt', 'hi', 'mr', 'bn', 'ta', 'te', 'ml']):
    for tip in tip_line_requests:
        tip['text'] = remove_emoji(
            tip['media_text'] if tip['media_text'] != 'NA'
            and len(tip['media_text']) >= len(tip['media_title']) else
            tip['media_title'])
        lang_data = cld3.get_language(tip['text'])
        if lang_data is not None and lang_data.probability >= 0.95:
            tip['language'] = lang_data.language
    tip_line_requests = [
        tip for tip in tip_line_requests if tip['text'] != 'NA'
        and not tip['text'].isspace() and 'language' in tip and (
            60 <= len(tip['text']) <= 1200) and not contains_url(tip['text'])
        and not contains_phone_number(tip['text'])
    ]

    return [{
        'text': item['text'],
        'language': item['language'],
        'source': SourceName.TIPLINE.value
    } for item in tip_line_requests if item['language'] in languages]
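
Example #4 uses the cld3 bindings (pycld3) for language identification: get_language() returns a prediction whose language and probability fields drive the 0.95 threshold above. A minimal, assumed usage sketch:

import cld3  # pycld3 binding assumed by example #4

pred = cld3.get_language("Este é um pequeno texto de exemplo em português.")
if pred is not None and pred.probability >= 0.95:
    # pred.language is a language code such as 'pt'; pred.is_reliable flags low-confidence guesses
    print(pred.language, pred.probability, pred.is_reliable)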
Example #5
 def get_trends_tokyo(self):
     # 1118285 is the WOEID (Where On Earth ID) for Tokyo
     return [
         remove_emoji(x["name"])
         for x in self.api.trends_place(1118285)[0]["trends"]
     ]
Example #6
def save_db(page_index=None,
            page_num=None,
            page_content=None,
            page_url=None,
            city_name=None,
            jd_layout_time=None,
            data=None,
            company_page=None,
            proxy=None,
            pay_data=None):
    '''
    Save the raw page to the database.
    '''
    logger = utils.get_logger()
    flg = False
    try:
        # url = data['url']
        city_url = data['cityUrl']
        # data['url'] = ''
        data['cityUrl'] = ''
        track_id = str(uuid.uuid4())
        if proxy:
            proxy = proxy['http'].replace('http://', '')
        page_content = utils.remove_emoji(page_content).decode(
            'utf8', 'ignore')
        if not pay_data:
            pay_data = {}
        pay_data['content'] = page_content
        sql = 'INSERT INTO jd_raw(trackId,source,content,createTime,createBy,ip,pageNum,pageIndex,pageUrl,jobCity,searchConditions,jdLayoutTime)VALUES(%s,"ZHI_LIAN",%s,now(),"python",%s,%s,%s,%s,%s,%s,%s)'
        sql_val = [
        track_id,  # keep the DB row's trackId consistent with the Kafka message below
            json.dumps(pay_data, ensure_ascii=False), proxy, page_num,
            page_index, page_url,
            city_name.decode('utf-8', 'ignore'),
            json.dumps(data, ensure_ascii=False), jd_layout_time
        ]
        # data['url'] = url
        data['cityUrl'] = city_url
        kafka_data = {
            "channelType": "WEB",
            "content": {
                "trackId": track_id,
                "content": json.dumps(pay_data, ensure_ascii=False),
                "id": '',
                "createTime": int(time.time() * 1000),
                "createBy": "python",
                "ip": proxy,
                "phoneUrl": '',
                "ocrImg": '',
                "jdLayoutTime": jd_layout_time,
                "pageUrl": page_url,
                "pageNum": page_num,
                "pageIndex": page_index,
                "jobCity": city_name.decode('utf-8', 'ignore'),
                "companyImgs": '',
                "source": "ZHI_LIAN",
                "searchConditions": json.dumps(data, ensure_ascii=False),
                "contactInfo": company_page.decode('utf-8', 'ignore'),
            },
            "interfaceType": "PARSE",
            "resourceDataType": "RAW",
            "resourceType": "JD_SEARCH",
            'protocolType': 'HTTP',
            "source": "ZHI_LIAN",
            "trackId": track_id,
        }
        utils.save_data(sql, sql_val, kafka_data)
        logger.info('saved to database successfully')
        flg = True
    except Exception:
        logger.error('database save failed %s %s' % (page_url, traceback.format_exc()))
        flg = False
    return flg
Example #7
    def get_post_info(self):

        # join the keyword arguments with '+' to form the search query string
        keyword = "+".join(self.args.keyword)

        # when using the DB
        # if self.args.is_db:
        #     row_id = self.db_model.set_daily_log(keyword, 1)

        start_url = "https://www.youtube.com/results?search_query="

        # search term setting
        term_dict = {
            'lasthour': 'AQ',
            'today': 'Ag',
            'thisweek': 'Aw',
            'thismonth': 'BA',
            'thisyear': 'BQ'
        }

        # build the search URL with the keyword and the period filter
        try:
            start_url = start_url + keyword + f'&sp=EgII{term_dict[self.args.choose_period]}%253D%253D'
        except KeyError:
            print('choose lasthour, today, thisweek, thismonth, thisyear')

        chrome_options = wd.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        driver = wd.Chrome(
            executable_path="/Users/oldman/Documents/crawler/chromedriver",
            chrome_options=chrome_options)
        driver.get(start_url)
        # record the initial page height
        last_page_height = driver.execute_script(
            "return document.documentElement.scrollHeight")

        # scroll to the bottom and stop once the page height no longer changes
        # the maximum number of scrolls comes from argparse (post_scroll_num)
        while True:
            self.args.post_scroll_num -= 1
            driver.execute_script(
                "window.scrollTo(0, document.documentElement.scrollHeight);")
            time.sleep(3)
            # Scroll down to bottom
            new_page_height = driver.execute_script(
                "return document.documentElement.scrollHeight")

            if new_page_height == last_page_height:
                break
            elif self.args.post_scroll_num == 0:
                break

            last_page_height = new_page_height

        print('Post url crawling complete.')
        html_source = driver.page_source

        soup = BeautifulSoup(html_source, 'lxml')

        for title in soup.select("a#video-title"):
            # href looks like '/watch?v=<id>'; strip the 9-character '/watch?v=' prefix
            unique_id = title.get('href')[9:]
            driver.get("https://www.youtube.com" + title.get('href'))
            print("https://www.youtube.com" + title.get('href'))
            last_page_height = driver.execute_script(
                "return document.documentElement.scrollHeight")

            # RDBM log stack
            body_is_new = {
                'is_new': False,
                'last_time_update': '1970-01-01 00:00:00'
            }

            # comment scroll
            # the maximum scroll count comes from argparse; use a local counter
            # so the limit applies to each video separately
            comment_scroll_left = self.args.comment_scroll_num
            while True:
                comment_scroll_left -= 1
                driver.execute_script(
                    "window.scrollTo(0, document.documentElement.scrollHeight);"
                )
                time.sleep(3.5)
                new_page_height = driver.execute_script(
                    "return document.documentElement.scrollHeight")

                if new_page_height == last_page_height:
                    break
                elif comment_scroll_left == 0:
                    break

                last_page_height = new_page_height

            print('Comment scroll complete.')

            html_source = driver.page_source
            soup = BeautifulSoup(html_source, 'lxml')

            # check whether this is a live stream
            find_live = soup.select('div#date > yt-formatted-string')
            # check whether this is a Premiere (waiting) clip
            find_wait = soup.select(
                'div#count > yt-view-count-renderer > span')
            try:
                live_tmp = str(find_live[0].text).replace('\n', '').replace(
                    '\t', '').replace('              ', '')
            except IndexError:
                live_tmp = 'No streaming'

            try:
                wait_tmp = str(find_wait[0].text).replace('\n', '').replace(
                    '\t', '').replace('              ', '')
            except IndexError:
                wait_tmp = 'No Premieres'

            live_comp = re.compile('Started streaming')
            wait_comp = re.compile('waiting')
            live_match = live_comp.match(live_tmp)
            wait_match = wait_comp.match(wait_tmp)

            if live_match:
                print('This is streaming')  # live stream
            elif wait_match:
                print('This is Premieres')  # Premiere (waiting) clip
            else:  # a normal uploaded clip
                soup = BeautifulSoup(html_source, 'lxml')

                clip_content = soup.select(
                    '#description > yt-formatted-string')
                contents = ''

                # concatenate the cleaned description fragments
                for content in clip_content:
                    str_tmp = utils.remove_emoji(
                        str(content.text).replace('\n', '').replace(
                            '\t', '').replace('              ', ''))
                    contents = contents + ' ' + str_tmp

                # number of comments
                print('comment_num is: ',
                      len(soup.select('#author-text > span')))

                try:
                    # like count from the aria-label of the first toggle button
                    like_count = utils.remove_str(
                        soup.find_all(
                            "yt-formatted-string", {
                                "class":
                                "style-scope ytd-toggle-button-renderer style-text"
                            })[0]["aria-label"])
                except (IndexError, KeyError):
                    like_count = 0

                try:
                    # dislike count from the aria-label of the second toggle button
                    dislike_count = utils.remove_str(
                        soup.find_all(
                            "yt-formatted-string", {
                                "class":
                                "style-scope ytd-toggle-button-renderer style-text"
                            })[1]["aria-label"])
                except (IndexError, KeyError):
                    dislike_count = 0

                try:
                    posting_date = datetime.strptime(
                        soup.select('#date > yt-formatted-string')[0].text,
                        "%Y-%m-%d").strftime("%Y-%m-%d %H:%M:%S")
                except ValueError:
                    posting_date = utils.conv_date2(
                        soup.select('#date > yt-formatted-string')[0].text)

                # the literals below match the Korean YouTube UI:
                # '조회수' = views, '회' = count suffix, '대기 중' = waiting, '조회수 없음' = no views
                view_count = utils.remove_str(
                    soup.select(
                        '#count > ytd-video-view-count-renderer > span.view-count.style-scope.ytd-video-view-count-renderer'
                    )[0].text.replace('\n', '').replace('\t', '').replace(
                        '조회수 ', '').replace('회', ''))
                view_comp = re.compile('대기 중')
                view_match = view_comp.match(str(view_count))

                if view_count == '조회수 없음':
                    view_count = 0
                elif view_match:
                    continue  # still waiting for a Premiere, skip this clip

                # post information
                post_dict = {
                    "unique_id":
                    unique_id,
                    "keyword":
                    keyword,
                    "title":
                    utils.addslashes(
                        soup.select('#container > h1 > yt-formatted-string')
                        [0].text.replace('\n', '').replace('\t', '').replace(
                            '              ', '')),
                    "user_id":
                    0,
                    "user_name":
                    utils.addslashes(
                        soup.select('#text > a')[0].text.replace(
                            '\n',
                            '').replace('\t',
                                        '').replace('              ', '')),
                    "posting_date":
                    posting_date,
                    "view_count":
                    view_count,
                    "like_count":
                    utils.conv_digit(like_count),
                    "dislike_count":
                    utils.conv_digit(dislike_count),
                    "contents":
                    utils.addslashes(contents),
                    "user_follow":
                    0,
                    "user_follower":
                    utils.conv_digit(
                        soup.select('yt-formatted-string#owner-sub-count')
                        [0].text),
                    "user_medias":
                    0,
                    "comment_count":
                    len(soup.select('#author-text > span'))
                }

                # if self.args.is_db:
                #     body_is_new = self.db_model.set_data_body(1, post_dict)
                self.post_list.append(post_dict)

            # comment information
            for i in range(len(soup.select('#author-text > span'))):

                try:
                    # like count of the i-th comment, read from its action-button aria-label
                    comment_like = utils.remove_str(
                        soup.find_all(
                            "span", {
                                "class":
                                "style-scope ytd-comment-action-buttons-renderer"
                            })[i]["aria-label"])
                except (IndexError, KeyError):
                    comment_like = 0

                try:
                    comment_dict = {
                        "unique_id":
                        unique_id,
                        "keyword":
                        keyword,
                        "user_name":
                        utils.addslashes(
                            soup.select('#author-text > span')[i].text.replace(
                                '\n', '').replace('\t', '').replace(
                                    '                ',
                                    '').replace('              ', '')),
                        "comment_date":
                        utils.conv_date(
                            soup.select(
                                '#header-author > yt-formatted-string > a')
                            [i].text).strftime("%Y-%m-%d %H:%M:%S"),
                        "comment":
                        utils.addslashes(
                            soup.select('#content-text')[i].text.replace(
                                '\n', '').replace('\t', '').replace(
                                    '                ',
                                    '').replace('              ', '')),
                        "comment_like":
                        utils.conv_digit(comment_like)
                    }

                except IndexError:
                    comment_dict = {
                        "unique_id":
                        unique_id,
                        "keyword":
                        keyword,
                        "user_name":
                        utils.addslashes(
                            soup.select('#author-text > span')[i].text.replace(
                                '\n        ', '').replace('\t', '').replace(
                                    '                ',
                                    '').replace('              ', '')),
                        "comment_date":
                        utils.addslashes(
                            soup.select(
                                '#header-author > yt-formatted-string > a')
                            [i].text).strftime("%Y-%m-%d %H:%M:%S"),
                        "comment":
                        utils.addslashes(
                            soup.select('#content-text')[i].text.replace(
                                '\n        ', '').replace('\t', '').replace(
                                    '                ',
                                    '').replace('              ', '')),
                        "comment_like":
                        utils.conv_digit(comment_like)
                    }

                # enable when using an RDBMS
                # if self.args.is_db:
                #     self.db_model.set_data_comment(1, comment_dict, body_is_new['is_new'],
                #                                    body_is_new['last_time_update'])
                self.comment_list.append(comment_dict)

        driver.quit()
        # if self.args.is_db:
        #     self.db_model.set_daily_log('', '', row_id)
        print("Done")
        print(f'Crawled post num: {len(self.post_list)}\n'
              f'Crawled comment num: {len(self.comment_list)}')

        return self.post_list, self.comment_list
Example #8
File: process.py  Project: logonmy/Spider-1
def save_db(page_content=None,
            proxy=None,
            city_name=None,
            phone_url=None,
            jd_layout_time=None,
            page_url=None,
            company_img=None,
            page_num=None,
            page_index=None,
            post_data=None,
            pay_data=None):
    logger = utils.get_logger()
    try:
        # print post_data
        city_url = post_data['cityUrl']
        func_url = post_data['funcUrl']
        # cityUrl and funcUrl are not written into the raw table
        post_data['cityUrl'] = ''
        post_data['funcUrl'] = ''
        page_content = utils.remove_emoji(page_content).decode(
            'utf8', 'ignore')
        pay_data['content'] = page_content
        track_id = str(uuid.uuid4())
        sql = 'insert into jd_raw(trackId,source,content,createTime,createBy,ip,pageNum,pageIndex,pageUrl,jobCity,jdLayoutTime,ocrImg,searchConditions)values(%s,"GJ_HR",%s,now(),"python",%s,%s,%s,%s,%s,%s,%s,%s)'
        # search_conditions = {'post_count': post_data['post_count'], 'resume_count': post_data['resume_count'],'':post_data}
        sql_val = [
            track_id,
            json.dumps(pay_data, ensure_ascii=False),
            proxy["http"].replace("http://", ""), page_num, page_index,
            page_url, city_name, jd_layout_time, phone_url,
            json.dumps(post_data, ensure_ascii=False)
        ]
        post_data['cityUrl'] = city_url
        post_data['funcUrl'] = func_url
        kafka_data = {
            "channelType": "WEB",
            "content": {
                "trackId": track_id,
                "content": json.dumps(pay_data, ensure_ascii=False),
                "id": '',
                "createTime": int(time.time() * 1000),
                "createBy": "python",
                "ip": proxy["http"].replace("http://", ""),
                "ocrImg": phone_url,
                "jdLayoutTime": jd_layout_time,
                "pageUrl": page_url,
                "pageNum": page_num,
                "pageIndex": page_index,
                "companyImgs": company_img,
                "source": "GJ_HR",
                "jobCity": city_name,
                "searchConditions": json.dumps(post_data, ensure_ascii=False)
            },
            "interfaceType": "PARSE",
            "resourceDataType": "RAW",
            "resourceType": "JD_SEARCH",
            'protocolType': 'HTTP',
            "source": "GJ_HR",
            "trackId": track_id,
        }
        flg = utils.save_data(sql, sql_val, kafka_data)
        logger.info('database insert succeeded %s ', str(flg))
    except Exception as e:
        logger.error(traceback.format_exc())