async def main():
    # Reuse one session to take advantage of TCP keep-alive
    # Set a 3 second connect and read timeout; the default total timeout is 5 minutes
    async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(connect=3, sock_read=3)) as session:
        tasks = [(async_download_link(session, link)) for link in get_links()]
        # gather aggregates all the tasks and schedules them in the event loop
        await asyncio.gather(*tasks, return_exceptions=True)
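A minimal sketch of the async_download_link coroutine that the snippet above assumes (the real helper lives elsewhere in the project; the file-naming scheme here is purely illustrative):

import os

async def async_download_link(session, link):
    # Fetch the resource through the shared session and write it to disk.
    filename = os.path.basename(link) or 'index.html'  # hypothetical naming scheme
    async with session.get(link) as response:
        data = await response.read()
    with open(filename, 'wb') as out:
        out.write(data)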
def main():
    '''
    scrapes all of today's articles on https://blogs.fangraphs.com/
    '''
    # get the links to all the articles
    url = 'https://blogs.fangraphs.com/'
    response = requests.get(url)
    page = response.text
    parser = "html.parser"
    soup = BeautifulSoup(page, parser)
    links = utils.get_links(soup)

    # if an article was written today, save it
    today_datetime = datetime.datetime.now().date()
    article_title_lst = []
    article_date_lst = []
    article_text_lst = []

    for link in links:
        response = requests.get(link)
        page = response.text
        soup = BeautifulSoup(page, parser)
        article_date = utils.get_article_date(soup)
        article_date_datetime = datetime.datetime.strptime(article_date,
            '%B %d, %Y').date()
        if today_datetime == article_date_datetime:
            article_title = utils.get_article_title(soup)
            article_text = utils.get_article_text(soup)
            article_title_lst.append(article_title)
            article_date_lst.append(article_date)
            article_text_lst.append(article_text)

    return article_title_lst, article_date_lst, article_text_lst
Example #3
def multi_threaded_execution(num_of_threads=8):
    class WorkerThread(threading.Thread):
        def run(self):
            while True:
                dir, link = queue.get()
                download_link(dir, link)
                queue.task_done()

    start_ts = time()
    download_dir = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    queue = Queue.Queue()  # Python 2 stdlib queue (import Queue); on Python 3 this is queue.Queue

    for i in range(num_of_threads):
        t = WorkerThread()
        t.daemon = True
        t.start()

    for link in links:
        queue.put((download_dir, link))

    queue.join()
    logging.info('Parallel download of %s images took %s seconds',
                 count,
                 time() - start_ts)
Example #4
def crawl(records, to_be_visited_queue, item_topic_arn, max_children=999):
    """
    Consume catalogs from the queue and insert their children
    for future visits.

    Input:
      records(list): catalogs to be visited
      to_be_visited_queue(string): URL of the SQS queue where child catalogs are inserted
      item_topic_arn(string): ARN of the SNS topic that receives the STAC items
      max_children(int): maximum number of children to be visited
    """

    for record in records:
        catalog = url_to_json(record['body'])
        clinks, items = get_links(catalog, record['body'])
        # Children catalogs are placed into the queue to be visited
        for index, clink in enumerate(clinks):
            if index == max_children:
                break
            SQS_CLIENT.send_message(QueueUrl=to_be_visited_queue,
                                    MessageBody=clink)
            print('Catalog inserted: ', clink)
        # Items are sent to SNS topic
        for item in items:
            json_item = url_to_json(item)
            SNS_CLIENT.publish(TargetArn=item_topic_arn,
                               Message=json.dumps(json_item))
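The crawl function above reads like the body of an SQS-triggered AWS Lambda consumer. A minimal wiring sketch under that assumption (the boto3 client names match the ones used above; the queue URL and topic ARN are hypothetical placeholders):

import boto3

SQS_CLIENT = boto3.client('sqs')
SNS_CLIENT = boto3.client('sns')

def handler(event, context):
    # With an SQS trigger, event['Records'] carries one entry per queued catalog URL.
    crawl(event['Records'],
          to_be_visited_queue='https://sqs.us-east-1.amazonaws.com/123456789012/to-be-visited',  # hypothetical
          item_topic_arn='arn:aws:sns:us-east-1:123456789012:stac-items')  # hypothetical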
Example #5
    def parse_tweet(self, response):
        # logging.info('Processing --> ' + response.url)
        username = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()'
        ).get(default='')
        full_name = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()'
        ).get(default='')

        try:
            # The page <title> is usually "name on Twitter: tweet text"; keep the part after the colon.
            tweet_text = response.xpath('//title/text()').get(
                default='').split(':')[1].strip()

        except IndexError:
            tweet_text = ' '.join(
                response.xpath(
                    '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()'
                ).getall()).strip()
        image_list = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src'
        ).getall()
        date_time = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()'
        ).get(default='')

        date_time = parser.parse(date_time.replace(
            '-', '')).strftime('%Y-%m-%d %H:%M:%S')
        retweets = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()'
        ).get(default='')

        likes = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()'
        ).get(default='')
        replies = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count'
        ).get(default='')

        mentions = get_mentions(tweet_text)
        hashtags = get_hashtags(tweet_text)
        cta = get_links(tweet_text)

        result = {
            'username': username.lower(),
            'full_name': full_name,
            'twitter_url': response.url,
            'tweet_text': tweet_text,
            'tweet_time': str(date_time),
            'number_of_likes': str(likes),
            'no_of_retweets': str(retweets),
            'no_of_replies': str(replies),
            'mentions': ' | '.join(mentions),
            'no_of_mentions': str(len(mentions)),
            'hashtags': ' | '.join(hashtags),
            'no_of_hashtags': str(len(hashtags)),
            'call_to_action': ' | '.join(cta),
            'image_url': ' | '.join(image_list),
            'tag': self.tag
        }
        yield result
Example #6
def queue_backed_execution():
    start_time = time()
    download_dir = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    q = rq(connection=Redis(host='localhost', port=6379))  # presumably `from redis import Redis` and `from rq import Queue as rq`
    for link in links:
        q.enqueue(download_link, download_dir, link)
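The enqueue calls above only schedule jobs; a separate RQ worker has to consume them. A minimal sketch of that worker, assuming download_link is importable by the worker process and Redis runs locally (equivalently, run the rq worker command from the project directory):

from redis import Redis
from rq import Queue, Worker

if __name__ == '__main__':
    redis_conn = Redis(host='localhost', port=6379)
    # Listen on the default queue and execute the enqueued download_link jobs.
    Worker([Queue(connection=redis_conn)], connection=redis_conn).work()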
Example #7
def get_my_url():
    """
    Fetch our own profile homepage and return it as the starting page.
    """
    myself_soup = utils.get_links(
        session, "https://www.zhihu.com/settings/profile")
    my_url = myself_soup.find(
        "div", {"id": "js-url-preview", "class": "url-preview"})
    return "https://www." + my_url.get_text()
Example #8
def single_threaded_execution():
    start_ts = time()
    download_directory = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    for link in links:
        download_link(download_directory, link)
    logging.info(
        'Sequential download of %s images took %s seconds', count,
        time() - start_ts)
Example #9
def crawlmainpage(url, session):
    print("entering crawl mainpage")
    mainpage_soup = utils.get_links(session, url)
    #print(mainpage_soup)
    #print('\n')
    mainpage_soup_str = str(mainpage_soup)
    with open('../mainpage.txt', 'wt', encoding='utf-8') as main:
        main.write(mainpage_soup_str)
    firstrule = re.compile(
        r'https:\\u002F\\u002Fapi.zhihu.com\\u002Fquestions\\u002F[0-9]+')
    firstmatch = re.findall(firstrule, mainpage_soup_str)
Example #10
def multi_process_execution(pool_size=8):
    start_time = time()
    download_dir = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    download = partial(download_link, download_dir)
    p = Pool(pool_size)
    p.map(download, links)
    p.close()
    p.join()
    logging.info(
        'Multiprocess download of %s images took %s seconds', count,
        time() - start_time)
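On platforms that spawn rather than fork worker processes (Windows, and macOS on recent Python versions), multiprocessing requires the entry point to be import-guarded; a minimal usage sketch:

if __name__ == '__main__':
    multi_process_execution(pool_size=8)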
Example #11
def get_my_url():
    """
    Fetch our own profile homepage and return it as the starting page.
    """
    myself_soup = utils.get_links(session,
                                  "https://www.zhihu.com/settings/profile")
    my_url = myself_soup.find("div", {
        "id": "js-url-preview",
        "class": "url-preview"
    })
    return "https://www." + my_url.get_text()
Example #12
def get_followees(user_url):
    """
    Fetch a user's followee list. If the user follows many people, the site only shows part of the list and loads the rest via AJAX; we only scrape the initial portion.
    """
    user_followees_url = user_url + "/followees"
    followees_list = []
    followees_soup = utils.get_links(session, user_followees_url)
    for i in followees_soup.find_all("span", {"class": "author-link-line"}):
        followee_url = i.find("a").attrs['href']
        followees_list.append(followee_url)
    if followees_list:
        return followees_list
    raise NoFolloweeError
Example #14
File: lnk2bak.py  Project: aurelg/linkbak
def main():
    """
    Run the script
    """
    args = parse_args()
    setup_logging(args)
    outdir = Path(get_output_dir())

    if not outdir.exists():
        outdir.mkdir()
    copy_ui()

    nb_workers = args.j if args.j else os.cpu_count()
    get_logger().warning("Using %s workers", nb_workers)

    if nb_workers > 1:
        with contextlib.closing(multiprocessing.Pool(nb_workers)) as pool:
            pool.starmap(start_link_handler,
                         [(l, args) for l in get_links(args.file[0])])
    else:
        for link in get_links(args.file[0]):
            start_link_handler(link, args)
Example #15
def create_links_pagelist(pagelist_noclean):
    start_time = time.time()
    pagelist_links = []
    listsize = len(pagelist_noclean)

    for i, (id, title, content) in enumerate(pagelist_noclean):
        links = get_links(content)
        pagelist_links.append((id, title, links))
        print_percentage(i, listsize)

    elapsed_time = time.time() - start_time
    print("  ** Finish create links pagelist")
    print("  - Elapsed time create links pagelist : {}".format(
        hms_string(elapsed_time)))
    return pagelist_links
Example #16
def get_my_url(session):
    """
    Fetch our own profile homepage and return it as the starting page.
    """
    myself_soup = utils.get_links(session, val['account_url'])
    # with open('../temp.txt', 'rt', encoding='utf-8') as temp:
    #     with open('../accoutpage.txt', 'wt',encoding='utf-8') as accoutp:
    #         accoutp.truncate()
    #         accoutp.write(temp.read())
    print(myself_soup)
    myidrule = re.compile(r'(?<=people","id":")[0-9a-z]+')
    myid = re.search(myidrule, str(myself_soup))
    try:
        my_id = myid.group(0)
    except AttributeError as ae:
        return False
    print("\n\nmy_id is " + my_id)
    return "https://www.zhihu.com/people/" + my_id
Example #17
    def run(self):
        """
        Run the thread's main loop.
        """
        while True:
            self.get_task()

            # Print the URL currently being crawled
            print("Thread #" + str(self.thread_grade) + ": " + self.url)

            soup = utils.get_links(self.session, self.url, self.proxy)

            if not soup:
                print("Url Error")
                continue
            data_dict = self.get_info(soup)

            # Store the result in the database
            dbAPI.store_by_mongodb(data_dict)

            # Throttle the crawl speed
            time.sleep(val['sleep'])
Example #19
def get_followees(user_url, session):
    """
    Fetch a user's followee list. If the user follows many people, the site only shows part of the list and loads the rest via AJAX; we only scrape the initial portion.
    """
    user_followees_url = user_url + "/following"
    print(user_followees_url)
    followees_list = []
    followees_soup = utils.get_links(session, user_followees_url)
    with open("../following.txt", 'w', encoding='utf-8') as temp:
        temp.write(str(followees_soup.prettify()))
    for i in followees_soup.find_all("a", {
            "class": "UserLink-link",
            "data-za-detail-view-element_name": "User"
    }):
        print("this is i " + i.get('href'))

        followee_url = "http:" + i.get('href')
        # skip duplicates so we don't collect lots of invalid entries
        if followee_url not in followees_list:
            followees_list.append(followee_url)
    if followees_list:
        return followees_list
    raise NoFolloweeError
Example #20
"""
Download images concurrently using concurrent.futures

"""
from concurrent.futures import ThreadPoolExecutor
from time import time

from utils import get_links, download_link

if __name__ == '__main__':
    ts = time()
    links = get_links()

    # By placing the executor inside a with block, the executor's shutdown()
    # method is called on exit, cleaning up the worker threads.
    #
    # When max_workers is omitted, older Python versions default to 5 times the
    # number of CPUs (Python 3.8+ uses min(32, os.cpu_count() + 4)); here we
    # pass 4 explicitly.
    with ThreadPoolExecutor(4) as executor:
        executor.map(download_link, links, timeout=30)

    print("Total time download: {}s".format(time() - ts))
Example #21
"""
Download images synchronously

"""

from utils import get_links, download_link
from time import time

if __name__ == '__main__':
    start_time = time()
    for link in get_links():
        download_link(link)
    print("Total time download: {}s".format(time() - start_time))
Example #22
    def process_detail(self, data):
        # -1- First, update check_time for the page that was just crawled
        assert len(data) == self.detail_len
        head_dict = json.loads(data[0:self.piece1_len - 1])
        #print head_dict
        update_item = {}
        update_item['url_no'] = head_dict['url_no']
        update_item['check_time'] = head_dict['timestamp']
        update_item['status_code'] = head_dict['Status-Code']
        ret = self.linkdb_logic.update(update_item)
        if ret['retcode'] != 0:
            logging.warning("update error. uno[%u] status_code[%u] check_time[%u] code[%u] mesg[%s] fno[%u] bid[%u]" \
                    % (update_item['url_no'], update_item['status_code'], update_item['check_time'], \
                    ret['retcode'], ret['message'], self.fileno, self.blockid))
        #f = open('data.o', 'w')
        #f.write(utils.unzipData(data[self.piece1_len:]))
        #f.close()
        #sys.exit(1)
        #log_message = "log_id[%u] p1l[%u] zlen[%u] delay[%u] jsonstr[%s]" \
        #        % (self.log_id, self.piece1_len, self.detail_len-self.piece1_len, \
        #        data[0:self.piece1_len - 1], int(time.time()) - head_dict['timestamp'])

        # -2- Parse the page to get a set of follow-up links (expanded within the same host only)
        assert self.detail_len >= self.piece1_len
        if (head_dict['Status-Code'] == 301 or head_dict['Status-Code']
                == 302) and head_dict.has_key('Location'):
            pass
            # Discard all redirects!
            #link_item = {}
            #try:
            #    link_item['refer_sign']  = long(hashlib.md5(head_dict['url']).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
            #except:
            #    rfurl = 'http://'+head_dict['host']
            #    link_item['refer_sign']  = long(hashlib.md5(rfurl).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
            #link_item['creat_time']  = head_dict['timestamp']
            #link_item['host_no']     = head_dict['host_no']
            #link_item['url_type']    = 0
            #link_item['status_code'] = 0
            #link_item['url']         = str(head_dict['Location'])
            #link_item['url_sign']  = long(hashlib.md5(link_item['url']).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
            #link_item['check_time']  = 0;

            ##print 'link_item["url_sign"]:', link_item['url_sign']
            #url_sign_retlist = self.linkdb_logic.select_url_sign([link_item['url_sign'],])
            #url_sign_set = set([x['url_sign'] for x in url_sign_retlist])
            #if link_item['url_sign'] not in url_sign_set:
            #    ret = self.linkdb_logic.insert(link_item)
            #    if ret['retcode'] != 0:
            #        logging.warning('redirection insert fail. retcode:%u message:%s status_code:%u jsonstr:%s fno:%u bid:%u' %\
            #                (ret['retcode'], ret['message'], head_dict['Status-Code'], data[0:self.piece1_len - 1], \
            #                self.fileno, self.blockid,))
        elif head_dict[
                'Status-Code'] == 200 and self.detail_len > self.piece1_len:
            #print '-'*80
            t = time.time()
            try:
                page_content = utils.unzipData(data[self.piece1_len:])
                # print page_content
                print "=========================="
                print 'unzip time_cost: %.3f' % (time.time() - t, )
                t = time.time()
                url_set = utils.get_links(page_content,
                                          host=head_dict['host'],
                                          inhost=True,
                                          base_url=head_dict['url'])
                print 'get_links time_cost: %.3f' % (time.time() - t, )
                print 'url_set:', url_set, self.piece1_len, self.detail_len
                #print 'content-len:', len(page_content)
                #self.send_response()
                #sys.exit(1)
                try:
                    refer_sign = long(
                        hashlib.md5(head_dict['url']).hexdigest()[:16],
                        16) & long('7fffffffffffffff', 16)
                    # First check whether each url_sign already exists in the database
                    url_sign_list = [
                        long(hashlib.md5(url).hexdigest()[:16], 16)
                        & long('7fffffffffffffff', 16) for url in url_set
                    ]

                    if url_sign_list:
                        t = time.time()
                        url_sign_retlist = self.linkdb_logic.select_url_sign(
                            url_sign_list)
                        #print 'select_url_sign time_cost: %.3f' % (time.time() - t,)

                        t = time.time()
                        url_sign_set = set(
                            [x['url_sign'] for x in url_sign_retlist])
                    else:
                        url_sign_set = set([])
                    link_item_list = []
                    for url in url_set:
                        #if -1 != url.find('ActionData.aspx'):
                        #    logging.warning ('got it. %s from %s' % (url, head_dict['url']))
                        #    continue
                        link_item = {}
                        link_item['url_sign'] = long(
                            hashlib.md5(url).hexdigest()[:16], 16) & long(
                                '7fffffffffffffff', 16)
                        if link_item['url_sign'] in url_sign_set:
                            continue
                        else:
                            link_item['refer_sign'] = refer_sign
                            link_item['creat_time'] = head_dict['timestamp']
                            link_item['host_no'] = head_dict['host_no']
                            link_item['check_time'] = 0
                            link_item['url_type'] = 0
                            link_item['status_code'] = 0
                            link_item['url'] = str(url)
                            #print link_item
                            link_item_list.append(link_item)
                    if link_item_list:
                        #print len(link_item_list), head_dict['url']
                        ret = self.linkdb_logic.insert_batch(link_item_list)
                        if ret['retcode'] != 0:
                            logging.warning("insert error. code[%u] mesg[%s] fno[%u] bid[%u]" %\
                                    (ret['retcode'], ret['message'], self.fileno, self.blockid))

                        log_message = "log_id[%u] p1l[%u] zlen[%u] delay[%u] all[%u] err[%u] jsonstr[%s] time_cost[%0.3f] fno[%u] bid[%u]" \
                                % (self.log_id, self.piece1_len, self.detail_len-self.piece1_len, \
                                int(time.time()) - head_dict['timestamp'], len(link_item_list), \
                                ret['failnum'], data[0:self.piece1_len - 1], time.time() - t, self.fileno, self.blockid)
                    else:
                        log_message = 'link_item_list empty. uno[%u] host[%s] delay[%u] p1l[%u] dlen[%u] url_count[%u] filted to zero. fno:%u bid:%u' \
                                % (head_dict['url_no'], head_dict['host'], int(time.time()) - head_dict['timestamp'], self.piece1_len, \
                                self.detail_len, len(url_set), self.fileno, self.blockid)
                    logging.info(log_message)
                    #print 'insert time_cost: %.3f' % (time.time() - t,)

                    #charset_start = page_content.find('charset=')
                    #charset_end   = page_content[charset_start:].find('"')
                    #charset = page_content[charset_start+8 : charset_start+charset_end]
                except:
                    logging.warning('shit happens in url. jsonstr:%s fno[%u] bid[%u]' % \
                        (data[0:self.piece1_len - 1], self.fileno, self.blockid,))
            except:
                logging.warning('unzip fail jsonstr:%s fno:%u bid:%u' % (
                    data[0:self.piece1_len - 1],
                    self.fileno,
                    self.blockid,
                ))
        else:
            logging.warning('shit happens. jsonstr:%s fno[%u] bid[%u]' % (
                data[0:self.piece1_len - 1],
                self.fileno,
                self.blockid,
            ))
            pass
            # TODO got some error
            #sys.exit(1)
        self.send_response()
Example #23
def crawlentpage(url, session, fromwhere):
    print("entering crawl entpage\n")
    myrule = re.compile(
        r'<a class=\"zu-top-nav-userinfo\" href=\"\/people\/(.*?)\">')
    followerrule = re.compile(
        r'<a class=\"zg-link author-link\" href=\"\/people\/(.*?)\">')
    nextfollowerrule = re.compile(
        r'<a class=\"zg-link author-link\" href=\"\\/people\\/(.*?)\">')
    ppidrule = re.compile(r'(?<=id="pp-)[0-9a-z]+')
    entpage_soup = utils.get_links(session, url)
    entpage_soup_str = str(entpage_soup)

    mymatch = re.findall(myrule, entpage_soup_str)

    followermatch = re.findall(followerrule, entpage_soup_str)
    ppidmatch = re.findall(ppidrule, entpage_soup_str)
    cirnum = 0
    ip = utils.prival['mongodbnet']['host']
    port = utils.prival['mongodbnet']['port']
    remoteclient = pymongo.MongoClient(str(ip) + ":" + str(port))
    while cirnum < len(followermatch):
        try:
            insertdomainid(remoteclient, val['dbnamenet'], val['colnamenet'],
                           followermatch[cirnum], ppidmatch[cirnum], fromwhere)
        except OSError as ee:
            print("{0}".format(ee))
        cirnum = cirnum + 1
    print('\nmy Domainhack is' + str(mymatch) + '\n')
    print('\nthe followers\' Domainhacks are' + str(followermatch) + '\n')
    print('while their ppid are ' + str(ppidmatch) + '\n')
    if len(followermatch) < 20:
        return
    lastonerule = re.compile(r'(?<=mi-)[0-9]+')
    allstart = re.findall(lastonerule, entpage_soup_str)
    try:
        truestart = allstart[len(allstart) - 1]
        lasttruestart = truestart
    except IndexError as ie:
        print("{0}".format(ie))
        return
    print('\nthe truestart is: ' + truestart)
    time.sleep(random.randint(5, 10))
    postnum = 0
    while postnum >= 0:  #utils.prival['flippagenum']:
        offsetnum = 40 + postnum * 20
        startnum = truestart
        print('offsetnum=' + str(offsetnum) + ' startnum=' + str(startnum))
        params = {"offset": str(offsetnum), "start": str(startnum)}
        #params = 'offset='+str(offsetnum)+'&start='+startnum
        #whether, session = zhihu_login.ZhihuAccount(zhihu_login.acc, zhihu_login.sec).login('en', load_cookies)
        nextentpage_soup = utils.mypost(session, url, params)
        try:
            nextentpage_soup_str = str(nextentpage_soup.prettify().encode(
                'latin-1').decode('unicode_escape'))
            if '\"errcode\": 1991832' in nextentpage_soup_str:
                print(nextentpage_soup_str)
                print('please switch to an account whose status is normal')
        except UnicodeEncodeError:
            nextentpage_soup_str = str(nextentpage_soup)
        nextfollowermatch = re.findall(nextfollowerrule, nextentpage_soup_str)
        ppidmatch = re.findall(ppidrule, nextentpage_soup_str)
        cirnum = 0
        while cirnum < len(nextfollowermatch):
            try:
                insertdomainid(remoteclient, val['dbnamenet'],
                               val['colnamenet'], nextfollowermatch[cirnum],
                               ppidmatch[cirnum],
                               fromwhere)  #,val['univer_name'][num_url])
            except OSError as ee:
                print("{0}".format(ee))
            cirnum = cirnum + 1
        print('\nthe followers\' Domainhacks are' + str(nextfollowermatch) +
              '\n')
        print('while their ppid are ' + str(ppidmatch) + '\n')
        if len(nextfollowermatch) < 20:
            return
        postnum = postnum + 1
        lastonerule = re.compile(r'(?<=mi-)[0-9]+')
        wholestart = re.findall(lastonerule, nextentpage_soup_str)
        if len(wholestart) > 0:
            truestart = wholestart[len(wholestart) - 1]
            if truestart == lasttruestart:
                print('flipping ' + str(postnum) + ' pages')
                return
            lasttruestart = truestart
            print('\nthe truestart is: ' + truestart)
        else:
            print('not found next start')
            print('flipping ' + str(postnum) + ' pages')
            return
        time.sleep(random.randint(10, 15))
Example #24
    def login(self, captcha_lang: str = 'en', load_cookies: bool = True):
        """
        Simulate logging in to Zhihu.
        :param captcha_lang: captcha language, 'en' or 'cn'
        :param load_cookies: whether to load the previously saved cookies
        :return: (bool, session), the login flag and the authenticated session
        """

        if load_cookies and self.load_cookies():
            print('Loading cookies file')
            print('the cookie.txt selected is ' + str(orifromtxt))
            if self.check_login():
                print('Login succeeded')
                with open(utils.lastfile, 'wt', encoding='utf-8') as tempname:
                    tempname.writelines(utils.cookiepath)
                personinfo = utils.get_links(self.session,
                                             utils.val['apime_url'])
                if not isinstance(personinfo, str):
                    try:
                        print(personinfo.text)
                    except AttributeError as ae:
                        print(
                            'AttributeError: \'str\' object has no attribute \'text\''
                        )
                else:
                    print(personinfo)
                return True, self.session
            print('Cookies have expired')

        self._check_user_pass()
        self.login_data.update({
            'username': self.username,
            'password': self.password,
            'lang': captcha_lang
        })

        timestamp = int(time.time() * 1000)
        self.login_data.update({
            'captcha':
            self._get_captcha(self.login_data['lang']),
            'timestamp':
            timestamp,
            'signature':
            self._get_signature(timestamp)
        })

        headers = self.session.headers.copy()
        headers.update({
            'content-type': 'application/x-www-form-urlencoded',
            'x-zse-83': '3_1.1',
            'x-xsrftoken': self._get_xsrf()
        })
        data = self._encrypt(self.login_data)
        login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'
        resp = self.session.post(login_api, data=data, headers=headers)
        while True:
            if 'error' in resp.text:
                print(json.loads(resp.text)['error'])
                print('i am here' + resp.text)
                self.login_data.update(
                    {'captcha': self._get_captcha(self.login_data['lang'])})
                data = self._encrypt(self.login_data)
                resp = self.session.post(login_api, data=data, headers=headers)
            else:
                break
        if self.check_login():
            print('Login succeeded')
            with open(utils.lastfile, 'wt', encoding='utf-8') as tempname:
                tempname.writelines(utils.cookiepath)
            personinfo = utils.get_links(self.session, utils.val['apime_url'])
            print(personinfo)
            return True, self.session
        print('Login failed')
        return False, self.session
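A minimal usage sketch for this login method, assuming it belongs to the project's ZhihuAccount wrapper class and that the constructor takes the account credentials (both are assumptions; the values below are placeholders):

ok, session = ZhihuAccount('user@example.com', 'not-a-real-password').login(
    captcha_lang='en', load_cookies=True)
if ok:
    soup = utils.get_links(session, 'https://www.zhihu.com/settings/profile')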
Example #25
    def process_detail(self, data):
        # -1- First, update check_time for the page that was just crawled
        assert len(data) == self.detail_len
        head_dict = json.loads(data[0:self.piece1_len - 1])
        #print head_dict
        update_item = {}
        update_item['url_no'] = head_dict['url_no']
        update_item['check_time'] = head_dict['timestamp']
        update_item['status_code'] = head_dict['Status-Code']
        ret = self.linkdb_logic.update(update_item)
        if ret['retcode'] != 0:
            logging.warning("update error. uno[%u] status_code[%u] check_time[%u] code[%u] mesg[%s] fno[%u] bid[%u]" \
                    % (update_item['url_no'], update_item['status_code'], update_item['check_time'], \
                    ret['retcode'], ret['message'], self.fileno, self.blockid))
        #f = open('data.o', 'w')
        #f.write(utils.unzipData(data[self.piece1_len:]))
        #f.close()
        #sys.exit(1)
        #log_message = "log_id[%u] p1l[%u] zlen[%u] delay[%u] jsonstr[%s]" \
        #        % (self.log_id, self.piece1_len, self.detail_len-self.piece1_len, \
        #        data[0:self.piece1_len - 1], int(time.time()) - head_dict['timestamp'])

        # -2- Parse the page to get a set of follow-up links (expanded within the same host only)
        assert self.detail_len >= self.piece1_len
        if (head_dict['Status-Code'] == 301 or head_dict['Status-Code'] == 302) and head_dict.has_key('Location'):
            pass
            # Discard all redirects!
            #link_item = {}
            #try:
            #    link_item['refer_sign']  = long(hashlib.md5(head_dict['url']).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
            #except:
            #    rfurl = 'http://'+head_dict['host']
            #    link_item['refer_sign']  = long(hashlib.md5(rfurl).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
            #link_item['creat_time']  = head_dict['timestamp']
            #link_item['host_no']     = head_dict['host_no']
            #link_item['url_type']    = 0
            #link_item['status_code'] = 0
            #link_item['url']         = str(head_dict['Location'])
            #link_item['url_sign']  = long(hashlib.md5(link_item['url']).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
            #link_item['check_time']  = 0;

            ##print 'link_item["url_sign"]:', link_item['url_sign']
            #url_sign_retlist = self.linkdb_logic.select_url_sign([link_item['url_sign'],])
            #url_sign_set = set([x['url_sign'] for x in url_sign_retlist])
            #if link_item['url_sign'] not in url_sign_set:
            #    ret = self.linkdb_logic.insert(link_item)
            #    if ret['retcode'] != 0:
            #        logging.warning('redirection insert fail. retcode:%u message:%s status_code:%u jsonstr:%s fno:%u bid:%u' %\
            #                (ret['retcode'], ret['message'], head_dict['Status-Code'], data[0:self.piece1_len - 1], \
            #                self.fileno, self.blockid,))
        elif head_dict['Status-Code'] == 200 and self.detail_len > self.piece1_len:
            #print '-'*80
            t = time.time()
            try:
                page_content = utils.unzipData(data[self.piece1_len:])
                #print 'unzip time_cost: %.3f' % (time.time() - t,)
                t = time.time()
                url_set = utils.get_links(page_content, host = head_dict['host'], inhost = True, base_url = head_dict['url'])
                #print 'get_links time_cost: %.3f' % (time.time() - t,)
                #print 'url_set:', url_set, self.piece1_len, self.detail_len
                #print 'content-len:', len(page_content)
                #self.send_response()
                #sys.exit(1)
                try:
                    refer_sign = long(hashlib.md5(head_dict['url']).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
                    # First check whether each url_sign already exists in the database
                    url_sign_list = [long(hashlib.md5(url).hexdigest()[:16], 16) & long('7fffffffffffffff', 16) for url in url_set]

                    if url_sign_list:
                        t = time.time()
                        url_sign_retlist = self.linkdb_logic.select_url_sign(url_sign_list)
                        #print 'select_url_sign time_cost: %.3f' % (time.time() - t,)

                        t = time.time()
                        url_sign_set = set([x['url_sign'] for x in url_sign_retlist])
                    else:
                        url_sign_set = set([])
                    link_item_list = []
                    for url in url_set:
                        #if -1 != url.find('ActionData.aspx'):
                        #    logging.warning ('got it. %s from %s' % (url, head_dict['url']))
                        #    continue
                        link_item = {}
                        link_item['url_sign']  = long(hashlib.md5(url).hexdigest()[:16], 16) & long('7fffffffffffffff', 16)
                        if link_item['url_sign'] in url_sign_set:
                            continue
                        else:
                            link_item['refer_sign']  = refer_sign
                            link_item['creat_time']  = head_dict['timestamp']
                            link_item['host_no']     = head_dict['host_no']
                            link_item['check_time']  = 0
                            link_item['url_type']    = 0
                            link_item['status_code'] = 0
                            link_item['url'] = str(url)
                            #print link_item
                            link_item_list.append(link_item)
                    if link_item_list:
                        #print len(link_item_list), head_dict['url']
                        ret = self.linkdb_logic.insert_batch(link_item_list)
                        if ret['retcode'] != 0:
                            logging.warning("insert error. code[%u] mesg[%s] fno[%u] bid[%u]" %\
                                    (ret['retcode'], ret['message'], self.fileno, self.blockid))

                        log_message = "log_id[%u] p1l[%u] zlen[%u] delay[%u] all[%u] err[%u] jsonstr[%s] time_cost[%0.3f] fno[%u] bid[%u]" \
                                % (self.log_id, self.piece1_len, self.detail_len-self.piece1_len, \
                                int(time.time()) - head_dict['timestamp'], len(link_item_list), \
                                ret['failnum'], data[0:self.piece1_len - 1], time.time() - t, self.fileno, self.blockid)
                    else:
                        log_message = 'link_item_list empty. uno[%u] host[%s] delay[%u] p1l[%u] dlen[%u] url_count[%u] filted to zero. fno:%u bid:%u' \
                                % (head_dict['url_no'], head_dict['host'], int(time.time()) - head_dict['timestamp'], self.piece1_len, \
                                self.detail_len, len(url_set), self.fileno, self.blockid)
                    logging.info(log_message)
                    #print 'insert time_cost: %.3f' % (time.time() - t,)

                    #charset_start = page_content.find('charset=')
                    #charset_end   = page_content[charset_start:].find('"')
                    #charset = page_content[charset_start+8 : charset_start+charset_end]
                except:
                    logging.warning('shit happens in url. jsonstr:%s fno[%u] bid[%u]' % \
                        (data[0:self.piece1_len - 1], self.fileno, self.blockid,))
            except:
                logging.warning('unzip fail jsonstr:%s fno:%u bid:%u' % (data[0:self.piece1_len - 1], self.fileno, self.blockid,))
        else:
            logging.warning('shit happens. jsonstr:%s fno[%u] bid[%u]' % (data[0:self.piece1_len-1],self.fileno,self.blockid,))
            pass
            # TODO got some error
            #sys.exit(1)
        self.send_response()
Example #26
#!/usr/bin/env python3

# -*- coding: utf-8 -*-

import sys
import time

from utils import read_last_email, get_links, logger, Requester, auth_3dhub
from config import NAME_3DHUBS, PASSWORD_3DHUBS

if __name__ == "__main__":
    logger.info("Start '{}' script.".format(sys.argv[0]))
    logger.debug("Read last email.")
    message_body = read_last_email()
    links_from_message = get_links(message_body)

    ignore_links_from_message = True
    if ignore_links_from_message and len(links_from_message) == 0:
        logger.error("There are no links in email body.")
        sys.exit(1)

    requester = Requester()
    logger.debug("Authorizate via account '{}'.".format(NAME_3DHUBS))
    auth_3dhub(requester, NAME_3DHUBS, PASSWORD_3DHUBS)
    time.sleep(1)
    if not ignore_links_from_message:
        link = links_from_message[0]
        logger.debug("Go to link [{}].".format(link))
        requester.get(link)
        sys.exit(0)
    logger.debug("Go to 'My orders'.")