import json
import os
import sys
import urllib
from subprocess import Popen, PIPE

# boto 2.x helper that wraps an SSH connection to an EC2 instance.
from boto.manage.cmdshell import sshclient_from_instance

# Project-local modules; adjust these import paths to the actual package
# layout.  `curl` (the HTTP helper used below) is assumed to be defined
# elsewhere in this module.
from aws import ec2, keys
from search.models import Site


def curl_cancel_crawl(site_id):
    """Cancel the scrapyd job recorded for the given Site."""
    s = Site.objects.get(pk=site_id)
    if s:
        url = ec2.getInstanceFromInstanceName(s.instance).ip_address
        params = urllib.urlencode({'project': 'deployable', 'job': s.jobid})
        ret = curl(url, "/cancel.json", "POST", params)
        return ret
    return None

def curl_schedule_crawl(site_id, crawler_instance='i-260aa82e'):
    """Schedule a SpiderAll crawl for the given Site on a crawler instance."""
    url = ec2.getInstanceFromInstanceName(crawler_instance).ip_address
    dl = Site.objects.get(pk=site_id).depthlimit
    print dl
    params = urllib.urlencode({'project': 'deployable', 'spider': 'SpiderAll', 'id': site_id})
    # scrapyd accepts per-job settings via extra "setting" parameters.
    params = params + '&setting=DEPTH_LIMIT=' + str(dl)
    print params
    ret = curl(url, "/schedule.json", "POST", params)
    return ret

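# Usage sketch (illustrative, not part of the original module): the site id is
# hypothetical, and the response shape shown is what scrapyd's schedule.json
# normally returns.  curl_cancel_crawl() relies on the job id having been
# persisted on the Site row (s.jobid) and the crawler recorded in s.instance.
#
#   resp = curl_schedule_crawl(42)   # e.g. {"status": "ok", "jobid": "..."}
#   curl_cancel_crawl(42)            # POSTs the stored s.jobid to /cancel.json
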
def get_job_status_count_for_instance(crawler_instance):
    """
    Get the job-status counts for an instance.
    Returns a dict with 'running', 'finished' and 'pending' counts,
    or None if the scrapyd response could not be parsed.
    """
    url = ec2.getInstanceFromInstanceName(crawler_instance).ip_address
    ret = curl(url, "/listjobs.json?project=deployable", "GET")
    inv_map = {}
    try:
        for k, v in ret.items():
            if k != 'status':
                inv_map[k] = len(v)
    except Exception:
        inv_map = None
    return inv_map

def get_jobs_for_instance(crawler_instance='i-260aa82e'):
    """
    Get the jobs currently known to an instance.
    Returns a dict keyed by job id, each value holding 'status' and
    'instance', or None if the scrapyd response could not be parsed.
    """
    url = ec2.getInstanceFromInstanceName(crawler_instance).ip_address
    ret = curl(url, "/listjobs.json?project=deployable", "GET")
    inv_map = {}
    try:
        for k, v in ret.items():
            if k != 'status':
                for j in v:
                    inv_map[j['id']] = {
                        'status': k,
                        'instance': crawler_instance,
                    }
    except Exception:
        inv_map = None
    return inv_map

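# Illustrative sketch (not part of the original module): aggregate the
# per-status job counts across every crawler instance.  It assumes, as
# deploy() does below, that ec2.getCrawlerInstances() yields instances whose
# .id can be passed to the helpers above.
def _example_status_report():
    for inst in ec2.getCrawlerInstances():
        if not inst.ip_address:
            continue
        print '[%s] %s' % (inst.id, get_job_status_count_for_instance(inst.id))
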
def deploy():
    """Push the crawler code to every crawler instance and run scrapyd-deploy."""
    ret = None
    for i in ec2.getCrawlerInstances():
        if not i.ip_address:
            continue
        print "[%s] %s" % (i.id, i.ip_address)
        ssh_client = sshclient_from_instance(
            ec2.getInstanceFromInstanceName(i.id),
            host_key_file='/home/ec2-user/.ssh/known_hosts',
            ssh_key_file=keys.aws_pem,
            user_name='ec2-user')
        ssh_client.put_file('/home/ec2-user/bblio/scraper/scrapyd.conf',
                            '/home/ec2-user/scrapyd.conf')

        home_dir = '/home/ec2-user/bblio/'
        copyList = [
            home_dir + 'build/search/models.py',
            home_dir + 'build/search/__init__.py',
            home_dir + 'build/Build/__init__.py',
            home_dir + 'build/Build/settings.py.crawler',
            home_dir + 'build/Build/myScript.py.crawler',
            home_dir + 'build/manage.py',
            home_dir + 'build/__init__.py',
            home_dir + 'aws/ec2.py',
            home_dir + 'aws/keys.py',
            home_dir + 'aws/key.pem',
            home_dir + 'aws/__init__.py',
            home_dir + 'config_file.py',
            home_dir + '__init__.py',
        ]

        # Collect every parent directory of the files to copy, walking up
        # until the path reaches home_dir, so they can be created remotely.
        dirList = []
        for c in copyList:
            c_dir = os.path.dirname(c)
            prev_dir = ''
            while c_dir != prev_dir and c_dir not in home_dir:
                if c_dir not in dirList:
                    dirList.append(c_dir)
                prev_dir = c_dir
                c_dir = os.path.dirname(c_dir)
        dirList.append(home_dir)
        # Create parent directories before their children.
        dirList.sort(key=len)

        for d in dirList:
            print('[dir][%s] %s' % (ssh_client.server.instance_id, d))
            ssh_client.run('mkdir %s' % d)
        for c in copyList:
            print('[file][%s] %s' % (ssh_client.server.instance_id, c))
            # Files suffixed with ".crawler" are the crawler-side variants;
            # drop the suffix on the remote copy.
            ssh_client.put_file(c, c.replace('.crawler', ''))

        # Point scrapy.cfg at this instance's scrapyd before deploying.
        with open("/home/ec2-user/bblio/scraper/deployable/scrapy.cfg", "w") as f:
            f.write(
"""
[settings]
default = deployable.settings

[deploy]
project = deployable\n
"""
            )
            f.write("url = http://")
            f.write(i.ip_address)
            f.write(":6800")
        print i.ip_address

        p = Popen('scrapyd-deploy', stdout=PIPE, shell=True,
                  cwd='/home/ec2-user/bblio/scraper/deployable')
        j = None
        while True:
            out = p.stdout.read()
            if out == '' and p.poll() is not None:
                break
            if out != '':
                if '{' in out:
                    # scrapyd-deploy prints its result as a JSON object.
                    j = json.loads(out)
                sys.stdout.write(out)
                sys.stdout.flush()
        #if j['status'] != 'ok':
        #    ret = ret + str(i.ip_address) + ' failed\n'
    return ret
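
# Usage sketch: a full redeploy is a single call.  It assumes the scrapyd-client
# tools (which provide the scrapyd-deploy command) are installed on this host
# and that each crawler instance is reachable over SSH with the keys.aws_pem key.
#
#   deploy()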