Example #1
def posts_index():
    driver = driver_facebook()
    driver.get(
        "https://www.facebook.com/groups/southmongoliasupport/?ref=direct")
    time.sleep(2)
    execute_times(driver, 2000)
    posts_html = driver.page_source

    with open("posts_index.html", "w", encoding='utf-8') as f:
        f.write(posts_html)
    log('posts_html written to posts_index.html')
    return posts_html
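
The helpers driver_facebook and execute_times are project-internal and not shown here. A minimal sketch of execute_times, assuming its job is to scroll the feed the given number of times so more posts lazy-load before page_source is read:

import time

def execute_times(driver, times, pause=1):
    for _ in range(times):
        # scroll to the bottom so the infinite feed loads another batch of posts
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)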
Example #2
def personal_data(index_html):
    data_sex = re.findall(r'"addFriendText".*?<', index_html) or re.findall(
        r'<span class="FollowLink">.*?</span>',
        index_html)
    log('data_sex', data_sex)
    post = PostData()
    if data_sex:
        # the profile snippet refers to the user as '他' (he) or '她' (she)
        if '他' in data_sex[0]:
            post.sex = 'man'
        if "她" in data_sex[0]:
            post.sex = "woman"

    profile = re.findall(r'<div id="intro_container_id">.*?</ul></div>', index_html)
    if not profile:
        global error_count
        error_count += 1
        log("error #{}: profile not found".format(error_count))
        return post

    e = pq(profile[0])
    all_profile = e.text()
    list_profile = all_profile.split("\n")
    log('list_profile', list_profile)
    for item in list_profile:
        # keyword routing: '曾经'/'就读于' = formerly/studied at (education),
        # '所在地' = current city, '来自' = hometown, '粉丝' = followers;
        # a line with '-' is typically "job - employer"
        if ("曾经" in item or '就读于' in item) and post.degree == '':
            post.degree = item
        elif "所在地" in item:
            post.location = item
        elif "来自" in item:
            post.come_form = item
        elif "粉丝" in item:
            post.followers = item
        elif "-" in item and post.job == '' and '曾经' not in item:
            post.job = item
    log('post', post)
    return post
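
PostData is defined elsewhere in the project. A minimal sketch of the container, assuming empty-string defaults (the parser above tests post.degree == '' and post.job == ''); the real class also carries post fields such as time and content used in Example #7:

class PostData:
    def __init__(self):
        self.sex = ''
        self.degree = ''
        self.location = ''
        self.come_form = ''
        self.followers = ''
        self.job = ''
        self.account_name = ''
        self.home_page = ''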
Example #3
    def run(self):
        # self.set_name()
        # while True:
        account_list = [
            '晚聊伴夜', '氢氪财经', '菲迪克智慧工程企业管理平台', '山西同乡群', '筱猫影视', '沈阳南动车运用所',
            '潇湘茶', '众智睿赢企业管理咨询有限公司', '微景相册', '书悦堂', '分享好宝贝', '民艺旅舍', '女王Dcup',
            '轻松定位美丽', '乐清市红辣椒越剧艺苑', '畅舞馆', '人禾健康产业', '常州格物斯坦机器人创客中心', '千秋妃子',
            '崇左航博'
        ]

        entity = None
        backpack_list = []
        for name in account_list:
            self.name = name
            html_account = self.account_homepage()
            if html_account:
                html, account_of_homepage = html_account
            else:
                continue
            log('start official account:', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()

            for page_count, url in enumerate(urls_article):
                # if page_count < 35:
                #     continue
                article = Article()
                article.create(url, self.name)
                log('Article title:', article.title)
                log("item #{}".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())

                # upload to the database
                sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
                _tuple = (article.url, datetime.datetime.now(), entity.account,
                          entity.account_id, entity.author, entity.id,
                          entity.title)
                uploads_mysql(config_mysql, sql, _tuple)
                # if page_count == 5:
                #     break

        log("发包")
        if entity:
            entity.uploads(backpack_list)
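
uploads_mysql is project-internal. Its contract, inferred from the call sites above, is "run one parameterized INSERT against the configured MySQL database"; a minimal sketch assuming that reading:

import pymysql

def uploads_mysql(config, sql, params):
    conn = pymysql.connect(**config)
    try:
        with conn.cursor() as cur:
            # parameterized execute: pymysql escapes each %s placeholder
            cur.execute(sql, params)
        conn.commit()
    finally:
        conn.close()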
Example #4
File: collect.py Project: whoiskx/com_code
    def run(self):
        # self.set_name()
        # while True:
        account_list = ['有看投',]
        entity = None
        backpack_list = []
        for name in account_list:
            self.name = name
            html_account = self.account_homepage()
            if html_account:
                html, account_of_homepage = html_account
            else:
                continue
            log('start official account:', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()
            # account.account_id = 126774646

            for page_count, url in enumerate(urls_article):
                # if page_count < 35:
                #     continue
                article = Article()
                article.create(url, account)
                log('Article title:', article.title)
                log("item #{}".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())
                # upload to the database
                sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)
                if page_count == 4:
                    break

        log("发包")
        if entity:
            # entity.uploads(backpack_list)
            # entity.uploads_datacenter_relay(backpack_list)
            entity.uploads_datacenter_unity(backpack_list)
            print('end')
Example #5
File: collect.py Project: whoiskx/com_code
    def run(self):
        # self.set_name()
        # while True:
        account_list = global_account_list or [
            '刀口谈兵',
        ]

        for name in account_list:
            self.name = name
            html_account = self.account_homepage()
            if html_account:
                html, account_of_homepage = html_account
            else:
                continue
            log('start official account:', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            # account.get_account_id()
            account.account_id = global_account_id or 126776905

            entity = None
            backpack_list = []
            for page_count, url in enumerate(urls_article):
                if page_count == 0:
                    continue
                article = Article()
                article.create(url, self.name)
                log('Article title:', article.title)
                log("item #{}".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())
                # upload to the database
                sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
                _tuple = (article.url, datetime.datetime.now(), entity.account,
                          entity.account_id, entity.author, entity.id,
                          entity.title)
                uploads_mysql(config_mysql, sql, _tuple)
                if page_count == 5:
                    break

            log("发包")
            if entity:
                entity.uploads(backpack_list)
                # entity.uploads_datacenter(backpack_list)
                print('end')
Example #6
    def account_homepage(self):
        # search for the official account and open its homepage
        search_url = self.url.format(self.name)
        resp_search = self.s.get(search_url, headers=self.headers, cookies=self.cookies)

        if 'class="b404-box" id="noresult_part1_container"' in resp_search.text:
            log("找不到该公众号: {}".format(self.name))
            return
        e = pq(resp_search.text)
        if e(".tit").eq(0).text() == self.name:
            account_link = e(".tit").find('a').attr('href')
        elif len(e(".tit").eq(0).text()) > 1:
            log("不能匹配正确的公众号: {}".format(self.name))
            return
        else:
            # 处理验证码
            self.crack_sougou(search_url)
            print("验证完毕")
            # 被跳过的公众号要不要抓取  大概 4次
            return
        account_match = re.search(r'微信号:\w*', e.text())
        account_search = account_match.group().replace('微信号:', '') if account_match else ''

        homepage = self.s.get(account_link, cookies=self.cookies)
        if '<title>请输入验证码 </title>' in homepage.text:
            print("captcha appeared")
            from verification_code import captch_upload_image
            print('------handling WeChat captcha------')
            cert = random.random()
            image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
            response = self.s.get(image_url)
            captch_input = captch_upload_image(response.content)
            print('------captcha: {}------'.format(captch_input))
            data = {
                'cert': cert,
                'input': captch_input
            }
            response = self.s.post(image_url, data=data, cookies=self.cookies)
            cookies = requests.utils.dict_from_cookiejar(response.cookies)
            print('cookies after captcha:', cookies)
            homepage = self.s.get(account_link, cookies=self.cookies)
            print('after cracking the captcha')
        account = pq(homepage.text)('.profile_account').text().replace('微信号: ', '')
        # the WeChat ID appears on both the search page and the homepage; fall back so it is found either way
        return homepage.text, account or account_search
Example #7
def parse_posts_html(posts_html):
    log("begin parse_posts_html")
    e = pq(posts_html)
    results = e(".userContentWrapper")
    for count, item in enumerate(results):
        item = pq(item)
        post = PostData()
        post.account_name = item(".fwb").text()
        post.content = item(".userContent").text()
        post.time = item(".timestampContent").text().split(" ")[0]
        post.praise = item('._ipn')("._3t54").text().split("\n")[-1][:1]
        post.share = item(".UFIShareLink").text()

        if '年' in post.time:
            post.year = post.time.split("年")[0]
        else:
            post.year = '2018'

        if '月' in post.time and '年' in post.time:
            post.month = post.time.split("年")[-1].split('月')[0]
        elif '月' in post.time:
            post.month = post.time.split('月')[0]

        # whatever follows '月' is the day (it may keep a trailing '日')
        post.day = post.time.split("月")[-1]
        post_dict = post.obj_to_dict()
        log("result parse_posts_html{}".format(post_dict))
        urun['post'].insert(post_dict)
        log('insert {} success'.format(post.account_name))
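
The split-based date handling above is fragile. A sketch of an alternative that parses Facebook's Chinese timestamps ("2017年1月5日" or "1月5日") with one regex, defaulting the year to 2018 as the original does:

import re

def parse_cn_date(text, default_year='2018'):
    # groups: optional year, month, day; the trailing '日' is consumed if present
    m = re.search(r'(?:(\d{4})年)?(\d{1,2})月(\d{1,2})日?', text)
    if not m:
        return default_year, '', ''
    year, month, day = m.groups()
    return year or default_year, month, day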
Example #8
def parse_url(url_dict):
    driver = driver_facebook()
    for count, u in enumerate(url_dict):
        # if count <= 100:
        #     continue
        try:
            link = u.get('link')
            name = u.get('name')
            log("begin name{}".format(name))
            driver.get(link)
            time.sleep(1)
            index_html = driver.page_source

            post = personal_data(index_html)

            post.account_name = name
            post.home_page = link

            urun['test'].insert(
                {
                    "account_name": post.account_name,
                    'home_page': post.home_page,
                    'location': post.location,
                    'come_form': post.come_form,
                    "job": post.job,
                    'followers': post.followers,
                    "degree": post.degree,
                    "sex": post.sex,
                    "is_get": True
                 }
            )

            log("insert {} sucessful".format(post.account_name))
            time.sleep(randint(2, 5))
            if count >= 10:
                break
        except Exception as e:
            log(count, e)
            continue
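
driver_facebook, used throughout these examples, is also project-internal. A minimal sketch, assuming it only has to return a configured Chrome WebDriver (the real helper presumably also handles login cookies):

from selenium import webdriver

def driver_facebook():
    options = webdriver.ChromeOptions()
    options.add_argument('--disable-notifications')  # Facebook otherwise pops dialogs
    return webdriver.Chrome(options=options)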
Example #9
    def run(self):
        name_list = self.get_name()
        for name in name_list:
            # name = '大鼎豫剧'
            log('start {}'.format(name))
            self.name = name
            _tuple = self.account_homepage(name)
            # skip official accounts the search cannot find
            if _tuple:
                html, _account = _tuple
            else:
                log('not find {}'.format(self.name))
                continue
            # links to all the articles
            items = re.findall('"content_url":".*?,"copyright_stat"', html)
            entity = None
            backpack_list = []
            for page_count, item in enumerate(items):
                # strip the '"content_url":"' prefix and the ',"copyright_stat"' suffix, unescape &amp;
                url_last = item[15:-18].replace('amp;', '')
                url = 'https://mp.weixin.qq.com' + url_last
                article = Article()
                article.create(url)
                if article.is_share is True:
                    continue
                log("catch {}".format(article.title))
                account = Account()
                # account: reading from a file differs from source search
                account.name = article.author
                account.account = article.account
                account.get_account_id()
                entity = JsonEntity(article, account)
                backpack = Backpack()
                # the article may be a share/repost
                # try:
                backpack.create(entity)
                # except Exception as e:
                #     log(e)
                #     continue
                backpack_list.append(backpack.create_backpack())

                # upload to the database
                sql = '''
                    INSERT INTO
                        account_http(article_url, addon, account, account_id, author, id, title)
                    VALUES
                        (%s, %s, %s, %s, %s, %s, %s)
                '''
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)
                # if page_count == 30:
                #     break
            log('catch {} successful, {} articles in total'.format(self.name, len(items)))

            log("发包")
            if entity:
                entity.uploads(backpack_list)
                log("uploads successful")
Example #10
    def run(self):
        # self.set_name()
        # while True:
        account_list = ['大数据发布', '上海港湾集团', '绿盟365', '酌梦录', '瞄了个喵', '豪德通讯', '魔都娱乐1', '大侠的小宇宙', '澳洲梦', '盛世路跑', '佛系金融女',
                        '中卫今日热点', '金华社区居委会', '昕说法', '华农海洋研会', '尘埃一生', '革镇堡街道普法', '速度车行', '七分钟高清视频', '摘星少女酱',
                        '青海省格尔木市健桥医院', '乐用好车', '最强省钱喵喵君', '石柱港航', '荣盛物业长沙花语馨苑客服中心', '汕头超声集团', '中奥吴郡半岛', '隽永人生',
                        '飞鸿影视传媒', 'RGSE义乌雨具遮阳及防护用品展']

        articles = []
        entity = None
        backpack_list = []

        for name in account_list:
            if len(name) == 0:
                continue
            self.name = name
            ID = hash_md5(self.name)

            html_account = self.account_homepage()
            if html_account:
                html, account_of_homepage = html_account
            else:
                continue
            log('start official account:', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()

            for page_count, url in enumerate(urls_article):
                # if page_count < 35:
                #     continue
                article = Article()
                article.create(url, self.name)
                log('Article title:', article.title)
                log("item #{}".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())

                # collect every article
                article_info = backpack.to_dict()
                articles.append({ID: article_info})
                # upload to the database
                sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)
                # if page_count == 5:
                #     break

        log("发包")
        if entity:
            entity.uploads(backpack_list)
Example #11
    'db': MYSQL_DATABASE,
}

db = pymysql.connect(**config_mysql)
cursor = db.cursor()
cursor_save = db.cursor()

cursor.execute('select * FROM imagefail_header_url')
count = 0
# offset trick: discard the first 5776 rows, then work on the next 2000
urls = cursor.fetchmany(5776)
urls = cursor.fetchmany(2000)
proxies = {"https": "http://localhost:1080", }

for index, url_tuple in enumerate(urls):
    numb, post_id, site, url = url_tuple
    id = post_id
    try:
        if not url:
            continue
        print(url)
        resp = requests.get(url, proxies=proxies)
        with open('img_header/{}.png'.format(post_id), 'wb') as f:
            f.write(resp.content)
        test['save_img_mysql'].insert({'id': numb, 'header_url': url, 'blogger_id': id})
        # print('save {}'.format(id))
        log('#{} save {} {}'.format(index, numb, id))
    except Exception as e:
        log(e)
        log('=============')
        log(index, post_id)
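
The double fetchmany above pages by discarding rows client-side. Paging in SQL is clearer and cheaper; a sketch against the same table (note that OFFSET without ORDER BY has no guaranteed row order):

cursor.execute('SELECT * FROM imagefail_header_url LIMIT %s OFFSET %s', (2000, 5776))
urls = cursor.fetchall()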
Example #12
from setting import log

logging = log()


class InfoBulider:
    def __init__(self, postion, city):
        self.__postion = postion
        self.__city = city

    def urlbulider(self):
        url = 'https://www.lagou.com/jobs/positionAjax.json?{}&needAd' \
              'dtionalResult=false'.format(self.__city)
        return url

    def headersbulider(self):
        headers = {
            'Host': 'www.lagou.com',
            'Origin': 'https://www.lagou.com',
            'Referer': 'https://www.lagou.com/jobs/list_{}?labelWords=&fromSearch=true&s'
                       'uginput='.format(self.__postion),
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like' \
                          ' Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'X-Anit-Forge-Code': '0',
            'X-Anit-Forge-Token': None,
            'X-Requested-With': 'XMLHttpRequest'
        }
        return headers

    def databulider(self, page):
        logging.info('data bulider working')
        # assumed payload: Lagou's positionAjax form fields (first, pn = page, kd = keyword)
        return {'first': 'true', 'pn': page, 'kd': self.__postion}
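
A usage sketch for the builder above. It assumes city is passed as a ready-made query fragment, since urlbulider formats it into the URL verbatim; the values are illustrative only:

info = InfoBulider(postion='python', city='city=上海')
url = info.urlbulider()
headers = info.headersbulider()
data = info.databulider(page=1)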
Example #13
File: collect.py Project: whoiskx/com_code
    def account_homepage(self):
        # search for the official account and open its homepage
        count = 0
        while True:
            search_url = self.url.format(self.name)
            resp_search = self.s.get(search_url, headers=self.headers, cookies=self.cookies)

            if '相关的官方认证订阅号' in resp_search.text:
                log("找不到该公众号: {}".format(self.name))
                return
            e = pq(resp_search.text)
            if self.name in e(".tit").eq(0).text():
                account_link = e(".tit").find('a').attr('href')
            elif len(e(".tit").eq(0).text()) > 1:
                log("不能匹配正确的公众号: {}".format(self.name))
                return
            else:
                log(search_url)
                # log(resp_search.text)
                log('cookies before verification', self.cookies)
                try_count = 0
                while True:
                    try_count += 1
                    self.crack_sougou(search_url)
                    if '搜公众号' in self.driver.page_source:
                        log('------updating cookies------')
                        cookies = self.driver.get_cookies()
                        new_cookie = {}
                        for items in cookies:
                            new_cookie[items.get('name')] = items.get('value')
                        self.cookies = new_cookie
                        log('------cookies updated------', self.cookies)
                        break
                    elif try_count > 6:
                        log("browser verification failed")
                        break

                log("verification finished")
                time.sleep(2)
                # TODO: decide whether skipped accounts should be recrawled (happens roughly 4 times)
                continue
            # '微信号:' is the "WeChat ID:" label on the result page
            account_match = re.search(r'微信号:\w*', e.text())
            account_search = account_match.group().replace('微信号:', '') if account_match else ''

            homepage = self.s.get(account_link, cookies=self.cookies)
            if '<title>请输入验证码 </title>' in homepage.text:
                print("captcha appeared")
                print('------handling WeChat captcha------')
                cert = random.random()
                image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
                response = self.s.get(image_url)
                captch_input = captch_upload_image(response.content)
                print('------captcha: {}------'.format(captch_input))
                data = {
                    'cert': cert,
                    'input': captch_input
                }
                response = self.s.post(image_url, data=data, cookies=self.cookies)
                cookies = requests.utils.dict_from_cookiejar(response.cookies)
                print('cookies after captcha:', cookies)
                homepage = self.s.get(account_link, cookies=self.cookies)
                print('after cracking the captcha')
            account = pq(homepage.text)('.profile_account').text().replace('微信号: ', '')
            # the WeChat ID appears on both the search page and the homepage; fall back so it is found either way
            return homepage.text, account or account_search
Example #14
File: collect.py Project: whoiskx/com_code
    def crack_sougou(self, url):
        log('------handling failed URL: {}------'.format(url))
        if re.search(r'weixin\.sogou\.com', url):
            log('------handling Sogou captcha------')
            self.driver.get(url)
            time.sleep(2)
            if '搜公众号' in self.driver.page_source:
                # page still looks normal; reload until the captcha page actually shows
                for i in range(30):
                    self.driver.get(url)
                    log('browser page is normal')
                    if '搜公众号' not in self.driver.page_source:
                        break
            try:
                img = self.wait.until(EC.presence_of_element_located((By.ID, 'seccodeImage')))
                log('------captcha page shown------')
                location = img.location
                size = img.size
                left = location['x']
                top = location['y']
                right = location['x'] + size['width']
                bottom = location['y'] + size['height']
                screenshot = self.driver.get_screenshot_as_png()
                screenshot = Image.open(BytesIO(screenshot))
                captcha = screenshot.crop((left, top, right, bottom))
                captcha_path = os.path.join(IMAGE_DIR, CAPTCHA_NAME)
                captcha.save(captcha_path)
                with open(captcha_path, "rb") as f:
                    filebytes = f.read()
                captch_input = captch_upload_image(filebytes)
                log('------captcha: {}------'.format(captch_input))
                if captch_input:
                    input_text = self.wait.until(EC.presence_of_element_located((By.ID, 'seccodeInput')))
                    input_text.clear()
                    input_text.send_keys(captch_input)
                    submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'submit')))
                    submit.click()
                    time.sleep(2)
                    try:
                        if '搜公众号' not in self.driver.page_source:
                            log('verification failed')
                            return
                        log('------captcha correct------')
                    except Exception:
                        log('------captcha input error------')
            except Exception as e:
                log('------no captcha page, landed on the homepage; ignoring------', e)

        elif re.search(r'mp\.weixin\.qq\.com', url):
            log('------handling WeChat captcha------')
            cert = random.random()
            image_url = 'https://mp.weixin.qq.com/mp/verifycode?cert={}'.format(cert)
            response = self.s.get(image_url, cookies=self.cookies)
            captch_input = captch_upload_image(response.content)
            log('------captcha: {}------'.format(captch_input))
            data = {
                'cert': cert,
                'input': captch_input
            }
            self.s.post(image_url, cookies=self.cookies, data=data)
            log('------cookies updated------')
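
captch_upload_image comes from verification_code.py, which is not shown. Its assumed contract: take raw image bytes, hand them to a captcha-solving service, and return the decoded text. A manual stub that satisfies the same interface for local testing:

def captch_upload_image(image_bytes):
    # save the captcha so a human can look at it, then type the answer in
    with open('captcha_debug.png', 'wb') as f:
        f.write(image_bytes)
    return input('captcha text: ')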
Example #15
    return config_mysql


if __name__ == '__main__':
    mysql_params = mysql_localhost()
    import pymysql

    db = pymysql.connect(**mysql_params)
    cursor = db.cursor()
    cursor.execute("SELECT * FROM `twfb_copy` where Url like '%facebook%';")
    # 9-19 1145 UID Newmacau https://zh-hk.facebook.com/Newmacau/
    # offset trick: skip the first 1114 rows, then take all remaining rows
    items = cursor.fetchmany(1114)
    items = cursor.fetchall()
    driver = webdriver.Chrome()
    for count, item in enumerate(items):
        log('iteration #{}, '.format(count))
        print(item)
        uid = item[0]
        url = item[2]
        if ('https' not in url) or ('小麗民主教室' in url):
            continue
        # url = 'https://www.facebook.com/pg/dokul1988/about/?ref=page_internal'
        driver.get(url)
        choice = ''
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'inputtext')))

        try:
            # group https://www.facebook.com/theeclecticsbasement/
            driver.find_element_by_css_selector('._64-f')
            info = group(driver)
Example #16
cursor.execute('select * FROM imagefail_header_url_copy')
count = 0
# offset trick: discard the first 475 rows, then work on the next 500
urls = cursor.fetchmany(475)
urls = cursor.fetchmany(500)
proxies = {
    "https": "http://localhost:1080",
}

for index, url_tuple in enumerate(urls):
    numb, post_id, site, url = url_tuple
    id = post_id
    try:
        if not url:
            continue
        print(url)
        resp = requests.get(url, proxies=proxies)
        with open('img_mysql/{}.png'.format(post_id), 'wb') as f:
            f.write(resp.content)
        test['save_img_mysql'].insert({
            'id': numb,
            'header_url': url,
            'blogger_id': id
        })
        # print('save {}'.format(id))
        log('{} save {}'.format(index, id))
    except Exception as e:
        log(e)
        log('=============')
        log(index, post_id)
Example #17
    def uploads(self, backpack_list):
        if not backpack_list:
            return
        server1 = 'http://115.231.251.252:26016/'
        server2 = 'http://60.190.238.168:38015/'
        body = json.dumps(backpack_list)
        # make sure the batch is delivered: retry up to 3 times,
        # without re-posting to a server that already accepted it
        sent1 = sent2 = False
        for _ in range(3):
            if not sent1:
                try:
                    log('start uploads')
                    r = requests.post(server1, data=body)
                    if r.status_code == 200:
                        sent1 = True
                        log('uploads server1 successful')
                except Exception as e:
                    log('uploads http error1', e)
            if not sent2:
                try:
                    r2 = requests.post(server2, data=body)
                    if r2.status_code == 200:
                        sent2 = True
                        log('uploads server2 successful')
                except Exception as e:
                    log('uploads http error2', e)
            if sent1 and sent2:
                break
        log('uploads over')
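
An alternative to the hand-rolled retry loop: requests' built-in urllib3 Retry, mounted on the session. total=3 mirrors the original attempt count; the server URLs are the ones above:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# allowed_methods=None retries POST too (urllib3 >= 1.26)
retry = Retry(total=3, backoff_factor=1, allowed_methods=None)
session.mount('http://', HTTPAdapter(max_retries=retry))
# session.post(server1, data=body) now retries transient failures automatically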
Example #18
import requests

from setting import test, log
proxies = {
    "https": "http://localhost:1080",
}
for index, item in enumerate(test['img_herder_merge'].find()):
    # blogger_id = item.get('url').split('com/')[-1]
    try:
        blogger_id = item.get('blogger_id')
        url = item.get('header_url')
        id = item.get('id')
        # if int(id) < 5976:
        #     continue
        # print(id, url)
        log(id, url)
        if not url:
            test['save'].insert({'id': id, 'header_url': ''})
            continue
        resp = requests.get(url, proxies=proxies)
        with open('img_header_6000/{}.png'.format(blogger_id), 'wb') as f:
            f.write(resp.content)
        test['save_img'].insert({'id': id, 'header_url': url})
        # print('save {}'.format(id))
        log('{} save {}'.format(index, id))
    except Exception as e:
        log(e)
        log('=============')
        log(id, blogger_id)
Example #19
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()

from setting import log

count = 0
while True:
    # keep loading the same post and reading its author, counting successful rounds
    url = 'https://www.facebook.com/permalink.php?story_fbid=2163264660369593&id=188533647842714'
    driver.get(url)
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "fwn")))
    author = driver.find_element_by_class_name('fwb').text
    count += 1
    print(count)
    log(count)
Example #20
            # assumes header_url_person and header_url_group were initialized
            # (e.g. to None) earlier in the enclosing function
            try:
                header_url_person_div = driver.find_element_by_css_selector(
                    '.scaledImageFitWidth.img')
                header_url_person = header_url_person_div.get_attribute('src')
            except Exception:
                pass  # no personal-profile header image on this page

            try:
                header_url_group_div = driver.find_element_by_css_selector(
                    '._4jhq.img')
                header_url_group = header_url_group_div.get_attribute('src')
            except Exception:
                pass  # no group header image on this page

        except Exception as e:
            log(e)
            log('not find')

        try:
            header_url = header_url_person or header_url_group
            d = {
                'id': numb,
                'url': url,
                'header_url': header_url,
                'post_id': post_id,
            }

            test['img_header_url_10000'].insert(d)
            count += 1
            log('{} saved'.format(numb))
        except Exception as e:
Example #21
def parse_members_url(url_dict):
    driver = driver_facebook()
    error_count = 0
    for count, u in enumerate(url_dict):
        # if count <= 10:
        #     log("skip {} {}".format(count, u.get('name')))
        #     continue
        link = u.get('url')
        name = u.get('name')
        log("begin {} : {}", count, name)
        try:
            driver.get(link)
            time.sleep(2)
            post = MembersData()
            index_html = driver.page_source
            data_sex = re.findall(
                r'"addFriendText".*?<', index_html) or re.findall(
                    r'<span class="FollowLink">.*?</span>', index_html)
            log(data_sex)
            if data_sex:
                if '他' in data_sex[0]:
                    post.sex = 'man'
                if "她" in data_sex[0]:
                    post.sex = "woman"

            profile = re.findall(
                r'<div id="intro_container_id">.*?</ul></div>', index_html)
            if not profile:
                error_count += 1
                log("error {} : {} {}".format(error_count, count, link))
                continue
            e = pq(profile[0])
            all_profile = e.text()
            log(all_profile)
            list_profile = all_profile.split("\n")

            for item in list_profile:
                if ("曾经" in item or '就读于' in item) and post.degree == '':
                    post.degree = item
                elif "所在地" in item:
                    post.location = item
                elif "来自" in item:
                    post.come_form = item
                elif "粉丝" in item:
                    post.followers = item
                elif "-" in item and post.job == '' and '曾经' not in item:
                    post.job = item

            post.account_name = name
            post.home_page = link
            log("post", post)
            urun['test'].insert({
                "account_name": post.account_name,
                'home_page': post.home_page,
                'location': post.location,
                'come_form': post.come_form,
                "job": post.job,
                'followers': post.followers,
                "degree": post.degree,
                "sex": post.sex,
                "is_get": True,
            })
            log("insert {} sucessful".format(name))
            # if count >= 20:
            #     break
        except Exception as e:
            log(count, name, e)
            continue