Example #1
def curl_cancel_crawl(site_id):
    """Cancel the scrapyd job recorded for a Site via the instance's cancel.json endpoint."""
    try:
        s = Site.objects.get(pk=site_id)
    except Site.DoesNotExist:   # .get() raises rather than returning None
        return None
    url = ec2.getInstanceFromInstanceName(s.instance).ip_address
    params = urllib.urlencode({'project': 'deployable', 'job': s.jobid})
    return curl(url, "/cancel.json", "POST", params)
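All of these snippets lean on a curl() helper (and module-level imports such as urllib, Site and the project's ec2 wrapper) that are not shown in this listing. A minimal sketch of what that helper might look like, assuming it only has to talk to scrapyd's HTTP JSON API on the default port 6800 and hand back the decoded response; the name and argument order come from the calls above, the body is an assumption:

import json
import urllib2

def curl(host, path, method, params=None):
    # Hypothetical helper, not part of the original source.
    # urllib2 switches to POST as soon as a data argument is passed.
    url = 'http://%s:6800%s' % (host, path)
    try:
        if method == 'POST':
            response = urllib2.urlopen(url, params)
        else:
            response = urllib2.urlopen(url)
        return json.loads(response.read())
    except (urllib2.URLError, ValueError):
        return None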
Example #2
def curl_schedule_crawl(site_id, crawler_instance='i-260aa82e'):
    """Schedule a crawl of the given Site on a crawler instance via scrapyd's schedule.json."""
    url = ec2.getInstanceFromInstanceName(crawler_instance).ip_address
    dl = Site.objects.get(pk=site_id).depthlimit
    params = urllib.urlencode({'project': 'deployable', 'spider': 'SpiderAll', 'id': site_id})
    # Append the per-site depth limit as a scrapyd setting override; added by hand
    # so the inner '=' in DEPTH_LIMIT=<n> is not percent-encoded by urlencode.
    params += '&setting=DEPTH_LIMIT=' + str(dl)
    ret = curl(url, "/schedule.json", "POST", params)
    return ret
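scrapyd's schedule.json replies with the id of the job it queued, and curl_cancel_crawl() above expects to find that id back on the Site model as s.jobid. A hedged sketch of that round trip; persisting the job id on Site is an assumption about the surrounding project:

site = Site.objects.get(pk=42)            # some site id, purely for illustration
ret = curl_schedule_crawl(site.pk)
if ret and ret.get('status') == 'ok':
    site.jobid = ret['jobid']             # scrapyd replies {"status": "ok", "jobid": "..."}
    site.save()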
Example #3
def get_job_status_count_for_instance(crawler_instance):
    """
    Get the per-status job counts for a crawler instance.
    Returns a dict with 'pending', 'running' and 'finished' keys mapping to
    the number of jobs in each state, or None if the instance can't be queried.
    """
    url = ec2.getInstanceFromInstanceName(crawler_instance).ip_address
    ret = curl(url, "/listjobs.json?project=deployable", "GET")
    counts = {}
    try:
        for k, v in ret.items():
            if k != 'status':        # skip the API's own status field
                counts[k] = len(v)   # each remaining value is a list of jobs
    except (AttributeError, TypeError):
        counts = None
    return counts
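For reference, scrapyd's listjobs.json groups jobs into 'pending', 'running' and 'finished' lists next to its own 'status' field, so the loop above just replaces each list with its length. An illustrative reply and result (job ids and times are made up):

# {"status": "ok",
#  "pending":  [{"id": "78391cc0fcaf11e1b0090800272a6d06", "spider": "SpiderAll"}],
#  "running":  [],
#  "finished": [{"id": "2f16646cfcaf11e1b0090800272a6d06", "spider": "SpiderAll",
#                "start_time": "2014-01-01 10:00:00", "end_time": "2014-01-01 10:05:00"}]}
#
# get_job_status_count_for_instance(...) -> {'finished': 1, 'pending': 1, 'running': 0}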
Example #4
def get_jobs_for_instance(crawler_instance='i-260aa82e'):
    """
    Get the current jobs on a crawler instance, keyed by scrapyd job id.
    Returns a dict of the form
        {job_id: {'status': 'pending'|'running'|'finished',
                  'instance': <crawler instance name>}}
    or None if the instance can't be queried.
    """
    url = ec2.getInstanceFromInstanceName(crawler_instance).ip_address
    ret = curl(url, "/listjobs.json?project=deployable", "GET")
    jobs = {}
    try:
        for k, v in ret.items():
            if k != 'status':            # skip the API's own status field
                for j in v:              # each entry is one job dict carrying an 'id'
                    jobs[j['id']] = {
                        'status': k,
                        'instance': crawler_instance,
                    }
    except (AttributeError, TypeError):
        jobs = None
    return jobs
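A hedged usage sketch for collecting the job map from every crawler at once, reusing the same ec2.getCrawlerInstances() call that deploy() below iterates over; scrapyd job ids are unique, so the per-instance dicts can simply be merged:

all_jobs = {}
for inst in ec2.getCrawlerInstances():
    if not inst.ip_address:               # skip instances without an address, as deploy() does
        continue
    jobs = get_jobs_for_instance(inst.id)
    if jobs:
        all_jobs.update(jobs)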
Example #5
def deploy():
    """
    Push the crawler code and scrapyd config to every crawler instance and
    then run scrapyd-deploy against it.  Relies on module-level imports of
    os, sys, json, subprocess (Popen/PIPE) and boto's sshclient_from_instance.
    """
    ret = None
    for i in ec2.getCrawlerInstances():
        if not i.ip_address:
            continue
        print "[%s] %s" % (i.id, i.ip_address)

        # sshclient_from_instance (boto.manage.cmdshell) gives us put_file()/run() over SSH.
        ssh_client = sshclient_from_instance(
            ec2.getInstanceFromInstanceName(i.id),
            host_key_file='/home/ec2-user/.ssh/known_hosts',
            ssh_key_file=keys.aws_pem,
            user_name='ec2-user')
        ssh_client.put_file('/home/ec2-user/bblio/scraper/scrapyd.conf', '/home/ec2-user/scrapyd.conf')

        home_dir = '/home/ec2-user/bblio/'

        # Files that make up the minimal Django + crawler environment on the instance.
        copyList = []
        copyList.append(home_dir + 'build/search/models.py')
        copyList.append(home_dir + 'build/search/__init__.py')
        copyList.append(home_dir + 'build/Build/__init__.py')
        copyList.append(home_dir + 'build/Build/settings.py.crawler')
        copyList.append(home_dir + 'build/Build/myScript.py.crawler')
        copyList.append(home_dir + 'build/manage.py')
        copyList.append(home_dir + 'build/__init__.py')
        copyList.append(home_dir + 'aws/ec2.py')
        copyList.append(home_dir + 'aws/keys.py')
        copyList.append(home_dir + 'aws/key.pem')
        copyList.append(home_dir + 'aws/__init__.py')
        copyList.append(home_dir + 'config_file.py')
        copyList.append(home_dir + '__init__.py')

        # Work out every parent directory of the files to be copied (up to
        # home_dir) so they can be created on the remote host, parents first.
        dirList = []
        for c in copyList:
            c_dir = os.path.dirname(c)
            prev_dir = ''
            while c_dir != prev_dir and c_dir not in home_dir:
                if c_dir not in dirList:
                    dirList.append(c_dir)
                prev_dir = c_dir
                c_dir = os.path.dirname(c_dir)
        dirList.append(home_dir)
        dirList.sort(key=len)   # shortest paths first, so parents are created before children

        # Create the directory tree on the remote host.
        for d in dirList:
            print('[dir][%s] %s' % (ssh_client.server.instance_id, d))
            ssh_client.run('mkdir %s' % d)

        # Copy each file across; the '.crawler' suffix is dropped so that e.g.
        # settings.py.crawler lands on the instance as settings.py.
        for c in copyList:
            print('[file][%s] %s' % (ssh_client.server.instance_id, c))
            ssh_client.put_file(c, c.replace('.crawler', ''))

        # Rewrite the local scrapy.cfg so that scrapyd-deploy targets this
        # instance's scrapyd endpoint on port 6800.
        with open("/home/ec2-user/bblio/scraper/deployable/scrapy.cfg", "w") as f:
            f.write(
"""[settings]
default = deployable.settings

[deploy]
project = deployable
url = http://%s:6800
""" % i.ip_address)
            print i.ip_address
        # Run scrapyd-deploy in the project directory; it packages an egg and
        # uploads it to the scrapyd endpoint written into scrapy.cfg above.
        p = Popen('scrapyd-deploy', stdout=PIPE, shell=True,
                  cwd='/home/ec2-user/bblio/scraper/deployable')
        j = None

        while True:
            out = p.stdout.read()
            if out == '' and p.poll() is not None:
                break
            if out != '':
                if '{' in out:
                    j = json.loads(out)   # scrapyd-deploy prints a JSON status line
                sys.stdout.write(out)
                sys.stdout.flush()
        #if j and j.get('status') != 'ok':
        #    ret = (ret or '') + str(i.ip_address) + ' failed\n'
    return ret
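For a crawler whose address were, say, 203.0.113.10 (a placeholder), the scrapy.cfg written above would come out as follows, which is what points the local scrapyd-deploy run at that instance:

[settings]
default = deployable.settings

[deploy]
project = deployable
url = http://203.0.113.10:6800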