import json

# `request` and the `spider` helpers are assumed to come from Flask and
# the project's own spider module; `count_limit` is configured elsewhere.
from flask import request

import spider


def user_profile(uid):
    uid = uid.strip()
    if not uid:
        return json.dumps({'msg': 'uid is required'}), 400
    s = spider.start_session()
    user = spider.profile(s, uid)
    if 'uid' not in user:
        return json.dumps({'msg': 'user uid {} does not exist'.format(uid)}), 404
    if int(user['fans_count']) > count_limit or int(user['follow_count']) > count_limit:
        return json.dumps({
            'msg': 'user fans({}) or follows({}) count limit exceeded'.format(
                user['fans_count'], user['follow_count'])
        }), 403
    res = spider.crawl(s, uid)
    with_addr = request.args.get('with_addr', '')
    # `is` tests object identity, not equality; small ints only compare
    # correctly by accident, so use == for the value check.
    if with_addr and int(with_addr) == 1:
        res = spider.with_addr(s, res)
    return json.dumps({'user': user, 'circle': res}), 200
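# A minimal sketch of wiring the handler into a Flask app; the route
# path and port are assumptions for illustration, not from the original.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/user/<uid>', view_func=user_profile)

if __name__ == '__main__':
    app.run(port=5000)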
import sys


def main():
    # Read command-line arguments.
    args = sys.argv[1:]
    # Decide how to crawl: one argument means no file limit,
    # two arguments cap the number of files fetched.
    if len(args) == 1:
        url = args[0]
        max_files = float('inf')
    elif len(args) == 2:
        url = args[0]
        max_files = int(args[1])
    else:
        print_usage()
        return
    print("Started crawling " + url + " domain")
    spider = mySpider()
    spider.crawl(url, max_files)
    print("Finished crawling " + url)
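# Hypothetical stubs so main() can run standalone; the real mySpider and
# print_usage are assumed to live elsewhere in the project.
class mySpider:
    def crawl(self, url, max_files):
        print(f"(stub) would crawl {url}, up to {max_files} files")


def print_usage():
    print("Usage: python main.py <url> [max_files]")


if __name__ == '__main__':
    main()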
from time import time


def crawl_menu():
    print('Enter number of start links')
    n = int(input())
    links = []
    for i in range(n):
        print(f'enter link number {i}')
        links.append(input())
    print('enter number of docs')
    count = int(input())
    print('enter output path')
    path = input()
    print('are you sure you want to crawl? it might take a few minutes '
          '(enter yes to crawl, anything else to cancel)')
    should_crawl = input().lower() == 'yes'
    if not should_crawl:
        return
    t0 = time()
    crawl(links, crawl_limit=count, output_path=path)
    t1 = time()
    total_time = t1 - t0
    print(f'finished crawling! It took {total_time:.1f} seconds')
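# Hypothetical stand-in for the project's crawl(); the signature matches
# the call above so the menu can be exercised without the real crawler.
def crawl(start_links, crawl_limit, output_path):
    print(f"(stub) crawling from {len(start_links)} seed links, "
          f"up to {crawl_limit} docs, writing to {output_path}")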
import os


def generate_graph(start_link_list, n_link_to_follow=2000, verbose=True):
    graph = None
    pid = os.getpid()
    num_links = len(start_link_list)
    if verbose:
        print(str(pid) + " --- must crawl " + str(num_links) + " links.")
    # Count from 1 so the progress line reads "Crawled 1 ... out of N"
    # after the first link; the original incremented before printing and
    # reported one link too many.
    for i, link in enumerate(start_link_list, start=1):
        link = link.rstrip()
        graph = crawl(link, n_link_to_follow, graph, pid)
        if verbose:
            print(str(pid) + " --- Crawled " + str(i) + " links out of " + str(num_links))
    write_graph(graph, 'graph_generated_by_' + str(pid))
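# The pid in the log lines suggests several workers run in parallel; a
# sketch of fanning seed links out over a process pool. The seeds file
# and worker count are hypothetical, not from the original.
from multiprocessing import Pool


def _worker(chunk):
    generate_graph(chunk, n_link_to_follow=2000)


if __name__ == '__main__':
    with open('seeds.txt') as f:
        seeds = f.read().splitlines()
    chunks = [seeds[i::4] for i in range(4)]  # four interleaved slices
    with Pool(4) as pool:
        pool.map(_worker, chunks)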
def crawler_task():
    print('Starting crawl task')
    # Iterating a dict yields its keys; the list()/.keys() copy is redundant.
    for region in REGION_TABLES:
        crawl(region, settings.PROCESS_COUNT)
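# A sketch of running the task on a fixed interval using only the
# stdlib; the hourly cadence is an assumption, not from the original.
import time

if __name__ == '__main__':
    while True:
        crawler_task()
        time.sleep(60 * 60)  # wait an hour between full crawls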
# `jsonify` is assumed to come from Flask, matching the handler style.
from flask import jsonify


def crawl_proxy(index):
    crawl(index)
    return jsonify({
        "status": 0,
        "msg": "ok"
    })
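# Hypothetical registration and smoke test via Flask's test client; the
# /crawl/<int:index> path is an assumption for illustration.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/crawl/<int:index>', view_func=crawl_proxy)

with app.test_client() as client:
    print(client.get('/crawl/1').get_json())  # {'msg': 'ok', 'status': 0}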
import sys

from spider import search, crawl
from utilities import cli

if __name__ == '__main__':
    key_word = input('Enter comic name: ')
    comics = search(key_word)
    if not comics:
        print('No item found')
        sys.exit()
    name = cli.select(comics.keys())
    comic, link = name, comics[name]
    ready = cli.confirm(f'Download {comic} now?')
    if ready:
        crawl(comic, link)
        print('DONE!')
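# Hypothetical doubles showing the shapes this script expects:
# search() -> {name: link}, cli.select() -> one chosen name,
# cli.confirm() -> bool. Useful for a dry run without the real modules.
def search(key_word):
    return {'Example Comic': 'https://example.com/comic/1'}


class cli:
    @staticmethod
    def select(names):
        return next(iter(names))

    @staticmethod
    def confirm(prompt):
        return True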