Example #1
def user_profile(uid):
    uid = uid.strip()
    if not uid:
        return json.dumps({
            'msg': 'uid is required'
        }), 400

    s = spider.start_session()
    user = spider.profile(s, uid)

    if 'uid' not in user:
        return json.dumps({
            'msg': 'user uid {} does not exist'.format(uid)
        }), 404

    if int(user['fans_count']) > count_limit or int(user['follow_count']) > count_limit:
        return json.dumps({
            'msg': 'user fans ({}) or follows ({}) count limit exceeded'.format(user['fans_count'], user['follow_count'])
        }), 403

    res = spider.crawl(s, uid)
    with_addr = request.args.get('with_addr', '')

    if with_addr and int(with_addr) == 1:
        res = spider.with_addr(s, res)

    return json.dumps({
        'user': user,
        'circle': res
    }), 200
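
For context, a view like this is normally wired into a Flask app; a minimal sketch of that wiring, assuming Flask, the project's spider module, and a count_limit defined elsewhere (the route path and the limit value below are assumptions):

import json

from flask import Flask, request

import spider  # the project's spider module, interface assumed from the snippet above

app = Flask(__name__)
count_limit = 10000  # hypothetical threshold; the real value is configured elsewhere

# Register the handler from Example #1 under an assumed URL rule.
app.add_url_rule('/user/<uid>', view_func=user_profile)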
Example #2
def main():
    # Read the command-line arguments (skip the script name)
    args = sys.argv[1:]

    # Decide how to crawl: an optional second argument caps the number of files
    if len(args) in (1, 2):
        url = args[0]
        maxFiles = int(args[1]) if len(args) == 2 else float('inf')
        print("Started crawling " + url + " domain")
        spider = mySpider()
        spider.crawl(url, maxFiles)
        print("Finished crawling " + url)
    else:
        print_usage()
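
The print_usage helper called above is not shown; a plausible sketch (the wording and script name are assumptions), together with the matching invocations:

import sys

def print_usage():
    # Hypothetical helper; the original implementation is not part of the snippet.
    print("Usage: python crawler.py <url> [max_files]")
    sys.exit(1)

# Assumed invocations:
#   python crawler.py https://example.com        crawl with no file limit
#   python crawler.py https://example.com 100    stop after 100 files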
Example #3
def crawl_menu():
    print('Enter number of start links')
    n = int(input())
    li = []
    for i in range(n):
        print(f'Enter link number {i + 1}')
        li.append(input())
    print('Enter number of docs')
    count = int(input())
    print('Enter output path')
    path = input()
    print(
        'Are you sure you want to crawl? It might take a few minutes '
        '(enter yes to crawl, anything else to stop)'
    )
    should_crawl = input().lower() == 'yes'
    if not should_crawl:
        return

    t0 = time()
    crawl(li, crawl_limit=count, output_path=path)
    t1 = time()
    total_time = t1 - t0
    print(f'Finished crawling! It took {total_time:.2f} seconds')
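
The same crawl can also be started non-interactively by calling crawl with the values the menu would collect; a small sketch (URLs, limit, and output path are placeholders):

from time import time

start_links = ['https://example.com', 'https://example.org']  # placeholder links
t0 = time()
crawl(start_links, crawl_limit=100, output_path='out/docs')
print(f'Finished crawling! It took {time() - t0:.2f} seconds')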
Example #4
def generate_graph(start_link_list, n_link_to_follow=2000, verbose=True):

    graph = None
    pid = os.getpid()
    i = 0
    num_links = len(start_link_list)

    if verbose:
        print(str(pid) + " --- must crawl " + str(num_links) + " links.")
    for link in start_link_list:
        link = link.rstrip()
        graph = crawl(link, n_link_to_follow, graph, pid)
        i += 1
        if verbose:
            print(str(pid) + " --- Crawled " + str(i) + " links out of " + str(num_links))

    write_graph(graph, 'graph_generated_by_' + str(pid))
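
The pid in the log lines and in the output filename suggests generate_graph is meant to run in several processes at once; a minimal sketch of that, assuming the start links are simply split into one chunk per worker:

import multiprocessing

def crawl_in_parallel(all_start_links, n_workers=4):
    # Hypothetical driver: each worker gets a slice of the start links and
    # writes its own graph file, keyed by its pid inside generate_graph.
    chunks = [all_start_links[i::n_workers] for i in range(n_workers)]
    with multiprocessing.Pool(n_workers) as pool:
        pool.map(generate_graph, chunks)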
Example #5
def crawler_task():
    print('Starting crawl task')
    for region in REGION_TABLES:
        crawl(region, settings.PROCESS_COUNT)
Example #6
def crawl_proxy(index):
    crawl(index)
    return jsonify({
        "status": 0,
        "msg": "ok"
    })
Example #7
from spider import search, crawl
from utilities import cli
import sys


if __name__ == '__main__':
    key_word = input('Enter comic name: ')
    comics = search(key_word)
    if not comics:
        print('No item found')
        sys.exit()

    name = cli.select(comics.keys())
    comic, link = name, comics[name]
    ready = cli.confirm(f'Download {comic} now?')
    if ready:
        crawl(comic, link)

    print('DONE!')