Пример #1
0
    def test_status_while_still_executing(self, executor):
        """status() reports a pending request while the executor is alive."""
        expected = 'curl www.yahoo.com is still waiting for a response'
        # Make the mocked process look like it has not exited yet.
        self.mock_executor.return_value.isalive.return_value = True

        pending = Curl('www.yahoo.com', self.mock_executor)
        status = pending.status()

        assert status == expected
Пример #2
0
 def crawling(self):
     """Fetch ``self.url`` and stream the response into ``self.filname``.

     The file is opened in binary mode and its ``write`` method is installed
     as the curl write callback. The curl handle is closed even when the
     transfer or the file handling raises.
     """
     # NOTE(review): ``filname`` looks like a typo for ``filename``; it is
     # kept because the attribute is assigned outside this block.
     curl = Curl()
     try:
         curl.set_url(self.url)
         with open(self.filname, "wb") as output:
             curl.set_option(pycurl.WRITEFUNCTION, output.write)
             curl.get()
     finally:
         # Previously skipped whenever set_url/open/get raised.
         curl.close()
Пример #3
0
class Crawler:
    """Minimal crawl skeleton: set up a Curl handle, crawl one URL, tear down.

    Subclasses customise behaviour by overriding ``initialize`` (extra
    constructor arguments) and ``crawling`` (what to do with the handle).
    """

    def __init__(self, url, *args, **kwargs):
        """Store ``url`` and forward any extra arguments to ``initialize``."""
        self.url = url
        self._url_lists = []
        self.initialize(*args, **kwargs)

    def initialize(self, *args, **kwargs):
        """Hook for subclasses; the base implementation does nothing."""
        pass

    @property
    def url_lists(self):
        """Return the internal URL list (the same list object, not a copy)."""
        return self._url_lists

    def setUp(self):
        """Create the Curl handle used by ``crawling``."""
        self.curl = Curl()

    def tearDown(self):
        """Close the Curl handle created by ``setUp``."""
        self.curl.close()

    def grab(self):
        """Run one crawl: ``setUp``, ``crawling``, then ``tearDown``.

        ``tearDown`` runs even when ``crawling`` raises, so the handle
        created in ``setUp`` is not leaked; the exception still propagates.
        """
        self.setUp()
        try:
            self.crawling()
        finally:
            self.tearDown()

    def crawling(self):
        """Fetch ``self.url`` with the prepared handle and print the result."""
        curl = self.curl
        curl.set_url(self.url)
        body = curl.get()
        print(body)
Пример #4
0
def crawlCraigz(iterations):
    """Fetch one Craigslist search page and record links that match ``cities``.

    A link is appended to the module-level ``links`` list (with a trailing
    newline) and printed once for every entry of ``cities`` that occurs in
    the link's element text or, lower-cased, in the link target after its
    first 14 characters. The function then calls itself again with the same
    ``iterations``; the guard at the top is what ends the recursion.

    Args:
        iterations: Multiplied by 120 and compared with ``get_counter(0)``.
            NOTE(review): presumably the number of 120-item result pages to
            walk; ``get_counter`` is defined elsewhere, confirm.

    Returns:
        None. Results are delivered through the ``links`` global.
    """
    global cities
    global links

    # Compare by value: ``is not 0`` tested object identity, which only works
    # by accident for small cached ints and warns on Python 3.8+.
    if get_counter(0) != 0 and iterations * 120 != get_counter(0):
        return

    cl = Curl(base_url="", fakeheaders=[
        'Cookie: cl_b=5GEZ9Y0F6RGXNBZ5tq5GrwngXVs; cl_def_hp=dallas',
    ])

    page = cl.get("http://dallas.craigslist.org/search/roo", {
        's': get_counter(120),
        'search_distance': 13,
        'postal': 75214,
        'min_price': 400,
        'max_price': 600,
        'availabilityMode': 0,
    })

    doc = lhtml.document_fromstring(page)
    for l in doc.iterlinks():
        # Per-link values do not depend on the city, so compute them once.
        href = l[2]
        linktext = href[14:]
        element_text = str(l[0].text)
        for c in cities:
            if c in element_text or c.lower() in linktext:
                links.append(href + '\n')
                print(href)

    return crawlCraigz(iterations)
Пример #5
0
class Crawler:
    """Base crawler: prepares a Curl handle, crawls ``url``, then cleans up.

    ``initialize`` and ``crawling`` are the extension points for subclasses.
    """

    def __init__(self, url, *args, **kwargs):
        """Remember ``url`` and pass remaining arguments to ``initialize``."""
        self.url = url
        self._url_lists = []
        self.initialize(*args, **kwargs)

    def initialize(self, *args, **kwargs):
        """Subclass hook called from ``__init__``; does nothing by default."""
        pass

    @property
    def url_lists(self):
        """Return the internal URL list (the same list object, not a copy)."""
        return self._url_lists

    def setUp(self):
        """Create the Curl handle that ``crawling`` works with."""
        self.curl = Curl()

    def tearDown(self):
        """Close the Curl handle opened in ``setUp``."""
        self.curl.close()

    def grab(self):
        """Run ``setUp``, ``crawling`` and ``tearDown`` in that order.

        ``tearDown`` is executed in a ``finally`` clause so the handle is
        released even if ``crawling`` raises; the error is not suppressed.
        """
        self.setUp()
        try:
            self.crawling()
        finally:
            self.tearDown()

    def crawling(self):
        """Request ``self.url`` through the prepared handle and print the body."""
        curl = self.curl
        curl.set_url(self.url)
        body = curl.get()
        print(body)
Пример #6
0
    def test_status_when_already_done_and_queried(self, executor):
        """status() hands back the stored result once the request is done."""
        finished = Curl('www.yahoo.com', self.mock_executor)
        # Pretend an earlier status() call already recorded the outcome.
        finished.result = 'result'
        finished.done = True

        status = finished.status()

        assert status == 'result'
Пример #7
0
 def get_zip(self, url, filename):
     """Download ``url`` into ``filename`` (opened in binary write mode).

     Both the output file and the curl handle are released even when one of
     the curl calls raises; previously an exception left both open.

     Args:
         url: Address passed to ``Curl.get``.
         filename: Path of the file that receives the data.
     """
     with open(filename, "wb") as fp:
         c = Curl()
         try:
             # NOTE(review): WRITEDATA is configured only after get() has been
             # called; the original call order is preserved here -- confirm
             # that get() does not already perform the transfer.
             c.get(url, )
             c.set_option(c.WRITEDATA, fp)
             c.perform()
         finally:
             # Close the handle before the file, as the original did.
             c.close()
Пример #8
0
    def test_status_with_77_exit_code(self, executor):
        """Exit status 77 is reported as a pinned-public-key failure."""
        expected = 'curl www.yahoo.com (77: SSL public key does not matched pinned public key) FAILED'
        process = self.mock_executor.return_value
        process.isalive.return_value = False
        process.exitstatus = 77

        finished = Curl('www.yahoo.com', self.mock_executor)
        status = finished.status()

        assert finished.done is True
        assert finished.result == expected
        assert status == expected
Пример #9
0
    def check_projects(self,projects):
        """Return True when the project listing contains ``'soufun_s2'``.

        The listing is fetched with ``Curl`` and parsed as JSON; its
        ``'projects'`` entry is scanned for the hard-coded project name.
        The ``projects`` argument is not used.
        """
        # NOTE(review): the address literally starts with "curl " -- confirm
        # that the Curl wrapper expects that prefix.
        response_text = Curl('curl http://localhost:6800/listprojects.json').get()
        listing = json.loads(response_text)
        return any(name == 'soufun_s2' for name in listing['projects'])
Пример #10
0
 def saveFile2Local(self,url):
     """Download ``url`` into the file named by ``self.filename``.

     ``self.getFileNameByUrl(url)`` is called first; nothing is downloaded
     when ``self.filename`` is falsy afterwards. The curl handle is closed
     even if the transfer fails, and the download is logged only on success.
     """
     self.getFileNameByUrl(url)
     if not self.filename:
         return
     with open(self.filename,"wb") as output:
         curl = Curl()
         try:
             curl.set_url(url)
             curl.set_option(pycurl.WRITEFUNCTION,output.write)
             curl.get()
         finally:
             # Previously skipped when set_url/set_option/get raised.
             curl.close()
         Log4Spider.downLog(self,"downloaded a file:[[[",self.filename,"]]]")
Пример #11
0
    def test_status_with_unknown_exit_code(self, executor):
        """An exit status without a known description is labelled 'unknown'."""
        expected = 'curl www.yahoo.com (78: unknown) FAILED'
        process = self.mock_executor.return_value
        process.isalive.return_value = False
        process.exitstatus = 78

        finished = Curl('www.yahoo.com', self.mock_executor)
        status = finished.status()

        assert finished.done is True
        assert finished.result == expected
        assert status == expected
Пример #12
0
    def test_status_with_1_exit_code(self, executor):
        """Exit status 1 is reported as an unsupported-protocol failure."""
        expected = 'curl www.yahoo.com (1: Unsupported protocol. This build of curl has no support for this protocol.) FAILED'
        process = self.mock_executor.return_value
        process.isalive.return_value = False
        process.exitstatus = 1

        finished = Curl('www.yahoo.com', self.mock_executor)
        status = finished.status()

        assert finished.done is True
        assert finished.result == expected
        assert status == expected
Пример #13
0
    def test_status_with_successful_exit_code(self, executor):
        """Exit status 0 is reported as a success."""
        expected = 'curl www.yahoo.com (0: Success) SUCCEEDED'
        process = self.mock_executor.return_value
        process.isalive.return_value = False
        process.exitstatus = 0

        finished = Curl('www.yahoo.com', self.mock_executor)
        status = finished.status()

        assert finished.done is True
        assert finished.result == expected
        assert status == expected
Пример #14
0
 def crawling(self):
     """Download ``self.url`` and write the response into ``self.filname``.

     The output file's ``write`` method is used as the curl write callback.
     The curl handle is always closed, including when the request fails.
     """
     # NOTE(review): ``filname`` is probably meant to be ``filename``; left
     # unchanged because the attribute is set outside this block.
     curl = Curl()
     try:
         curl.set_url(self.url)
         with open(self.filname,"wb") as output:
             curl.set_option(pycurl.WRITEFUNCTION,output.write)
             curl.get()
     finally:
         # Previously skipped whenever an earlier call raised.
         curl.close()
Пример #15
0
 def schedule_spiders(self,project_name,spider_name):
     """POST a schedule request and report whether the reply status is 'ok'.

     Args:
         project_name: Sent as the ``project`` field.
         spider_name: Sent as the ``spider`` field.

     Returns:
         True when the JSON reply's ``status`` equals ``'ok'``, else False.
     """
     scheduler = Curl('http://localhost:6800/schedule.json')
     reply_text = scheduler.post({'project':project_name,'spider':spider_name})
     reply = json.loads(reply_text)
     return reply['status'] == 'ok'
Пример #16
0
 def perform(self):
     """Fetch ``self.url`` and dump the highlighted cell values in rows of 3.

     Nothing happens unless the HTTP return code is 200. Every
     non-whitespace run wrapped in ``<font color=#0262cd>`` is collected,
     the matches are grouped three at a time (the last group may be
     shorter) and the groups are handed to ``self.__dump``.
     """
     curl = Curl( str(self.url) )
     curl.perform(False)
     if curl.getHttpReturnCode() != 200:
         return
     # Earlier, currently unused patterns kept for reference:
     #table_re_obj = re.compile(r"<table.+?bgcolor=#b18a02.+?>.*</table>", re.S)
     #row_re_obj = re.compile(r"<tr.+?bgcolor=#ffffff.+?></tr>", re.S)
     cell_pattern = re.compile(r"<font color=#0262cd>(\S+)</font>", re.S)
     cells = cell_pattern.findall(curl.getHttpContent())
     rows = [cells[start:start+3] for start in range(0, len(cells), 3)]
     self.__dump(rows)
Пример #17
0
 def run(self):
     """Poll the ``soufun_agent`` spider every 10 seconds, forever.

     Each round writes the result of ``self.check_spiders('soufun_agent')``
     to ``/tmp/daemon-log`` when it is truthy; otherwise it logs that the
     spider is being started and posts a schedule request. The log file is
     opened with a context manager so it is closed if the loop is ever left
     through an exception (the original never closed it).
     """
     with open("/tmp/daemon-log","w") as f:
         while True:
             res = self.check_spiders('soufun_agent')
             if res:
                 f.write('%s' % res)
             else:
                 f.write('starting the soufun_agent_spider\n')
                 url = 'http://localhost:6800/schedule.json'
                 curl = Curl(url)
                 dic = {'project':'soufun_s2','spider':'soufun_agent'}
                 # The reply is not inspected; the next poll shows the outcome.
                 curl.post(dic)
             f.flush()
             time.sleep(10)
 def get_integration_params(self):
     """GET the integration record named ``self._integration_name``.

     Returns:
         ``jsonData`` of the response.

     Raises:
         Exception: carrying ``errors`` of the response when it is non-empty.
     """
     endpoint = self._url + "/api/v3/integrations/" + self._integration_name
     response = Curl('GET', endpoint,
                     headers={'content-type': 'application/json'},
                     auth=self._auth)
     if len(response.errors) == 0:
         return response.jsonData
     raise Exception(response.errors)
Пример #19
0
    def curl(self, curl_what=CURL_SITE):
        """Ask how many curls to start, then launch them two seconds apart.

        The prompt repeats until a non-negative integer is entered; a blank
        line cancels without starting anything. Every started ``Curl`` is
        appended to ``self.traffic_handles``.
        """
        # TODO prompt for the url, default if none given
        # TODO move all user interaction stuff out into command files.
        while True:
            answer = raw_input(
                "How many curls for %s? (blank line to cancel) " %
                self.name).strip()
            if answer == "":
                return
            try:
                requested = int(answer)
            except ValueError:
                print("%s is not a number" % answer)
                continue
            if requested >= 0:
                break
            print("%i is not a valid number" % requested)

        print("\tstarting curl on %s..." % self.name)
        for number in range(requested):
            print("%s sending curl %s #%i" % (time.ctime(), curl_what, number))
            handle = Curl(curl_what, self.traffic_commands.curl)
            self.traffic_handles.append(handle)
            time.sleep(2)
        print("\tdone.")
Пример #20
0
 def get_channel_by_field_name_and_trackor_id(self, field_name, tid):
     """GET a single field of trackor ``tid`` and return its value.

     Args:
         field_name: Field requested via ``?fields=`` and used as the key
             into the response's ``jsonData``.
         tid: Trackor id; converted with ``str`` for the URL.

     Raises:
         Exception: carrying ``errors`` of the response when it is non-empty.
     """
     # NOTE(review): field_name is concatenated into the query string
     # without escaping -- confirm callers only pass URL-safe names.
     endpoint = (self._url + "/api/v3/trackors/" + str(tid)
                 + "?fields=" + field_name)
     response = Curl('GET', endpoint, headers=self._headers, auth=self._auth)
     if len(response.errors) == 0:
         return response.jsonData[field_name]
     raise Exception(response.errors)
Пример #21
0
 def get_notif_queue(self):
     """GET the notification queue for ``self._service_id``.

     Returns:
         ``jsonData`` of the response.

     Raises:
         Exception: carrying ``errors`` of the response when it is non-empty.
     """
     endpoint = (self._url + "/api/internal/notif/queue?service_id="
                 + str(self._service_id))
     response = Curl('GET', endpoint, headers=self._headers, auth=self._auth)
     if len(response.errors) == 0:
         return response.jsonData
     raise Exception(response.errors)
Пример #22
0
    def container_list(self, node_ip, node_port):
        """Collect a summary dict for every container listed by a node.

        The node's ``/containers/json?all=1`` listing is fetched first, then
        ``self._container_detail`` is called for each 12-character id.

        Returns:
            A dict keyed by 1-based position whose values are summary dicts
            (``id_num``, ``con_ip``, ``name``, ``cpuperiod``, ``cpuquota``,
            ``memory``, ``state``, ``cmd``, ``created``). An empty dict is
            returned when the listing is empty; if a detail lookup comes
            back empty, the entries gathered so far are returned.
        """
        listing_url = 'http://' + node_ip + ":" + node_port + "/containers/json?all=1"
        listing = Curl(listing_url).get_value()
        if not listing:
            return {}

        short_ids = [entry['Id'][0:12] for entry in listing]
        if len(short_ids) < 1:
            return {}

        summaries = {}
        for position, short_id in enumerate(short_ids, 1):
            detail = self._container_detail(node_ip, node_port, short_id)
            if len(detail) < 1:
                return summaries

            state_keys = detail['State'].keys()
            if 'Status' in state_keys:  # for docker 1.9 and higher
                state = str(detail['State']['Status'])
            elif 'Running' in state_keys:  # for docker 1.7
                state = str(detail['State']['Running'])
            else:  # neither key present
                state = "Exited"

            summaries[position] = {
                'id_num': detail['Id'][0:12],
                'con_ip': detail['NetworkSettings']['IPAddress'],
                'name': detail['Name'],
                'cpuperiod': detail['HostConfig']['CpuPeriod'],
                'cpuquota': detail['HostConfig']['CpuQuota'],
                'memory': detail['HostConfig']['Memory'],
                'state': state,
                'cmd': str(detail['Config']['Cmd']),
                'created': detail['State']['StartedAt'],
            }
        return summaries
Пример #23
0
 def perform(self):
     """Fetch ``self.url`` and parse the first quoted, comma-separated record.

     Fields 1-5 of the record are stored as floats in ``open_price``,
     ``close_price``, ``cur_price``, ``high_price`` and ``low_price``.

     Returns:
         True when the HTTP return code is 200 and a quoted record was
         found; False otherwise.
     """
     curl = Curl( str(self.url) )
     curl.perform(False)
     if curl.getHttpReturnCode() != 200:
         return False

     quoted = re.compile(r"\"(.+?)\"", re.S)
     match = quoted.search(curl.getHttpContent())
     if not match:
         return False

     fields = match.group(1).split(",")
     targets = (
         ("open_price", 1),
         ("close_price", 2),
         ("cur_price", 3),
         ("high_price", 4),
         ("low_price", 5),
     )
     for attribute, index in targets:
         setattr(self, attribute, float( fields[index].strip() ))
     return True
Пример #24
0
    def get_users_by_ids(self, user_ids):
        """GET the user records for the distinct ids in ``user_ids``.

        Duplicate ids are removed before the comma-separated ``user_ids``
        query value is built.

        Returns:
            ``jsonData`` of the response.

        Raises:
            Exception: carrying ``errors`` of the response when non-empty.
        """
        unique_ids = list(set(user_ids))
        endpoint = self._url + "/api/internal/users?user_ids="
        endpoint += ','.join(map(str, unique_ids))

        response = Curl('GET', endpoint, headers=self._headers, auth=self._auth)
        if len(response.errors) == 0:
            return response.jsonData
        raise Exception(response.errors)
 def get_phone_number_by_field_name_and_trackor_id(self, field_name, tid):
     """GET a single field of trackor ``tid`` using username/password auth.

     Args:
         field_name: Field requested via ``?fields=`` and used as the key
             into the response's ``jsonData``.
         tid: Trackor id; converted with ``str`` for the URL.

     Raises:
         Exception: carrying ``errors`` of the response when it is non-empty.
     """
     endpoint = (self._url + "/api/v3/trackors/" + str(tid)
                 + "?fields=" + field_name)
     credentials = (self._username, self._password)
     response = Curl('GET', endpoint, headers=self._headers, auth=credentials)
     if len(response.errors) == 0:
         return response.jsonData[field_name]
     raise Exception(response.errors)
 def add(self, log_level, message, description=""):
     """POST a log entry for the current run if ``log_level`` is enabled.

     The entry is sent only when ``log_level.log_level_id`` is less than or
     equal to ``self._ov_log_level.log_level_id``; otherwise nothing is sent
     and None is returned.

     Returns:
         ``jsonData`` of the response, or None when the entry was skipped.

     Raises:
         Exception: carrying ``errors`` of the response when it is non-empty.
     """
     if not (log_level.log_level_id <= self._ov_log_level.log_level_id):
         return None

     body = json.dumps({
         'message': message,
         'description': description,
         'log_level_name': log_level.log_level_name,
     })
     endpoint = (self._url + "/api/v3/integrations/runs/"
                 + str(self._process_id) + "/logs")
     response = Curl('POST', endpoint, data=body,
                     headers={'content-type': 'application/json'},
                     auth=self._auth)
     if len(response.errors) == 0:
         return response.jsonData
     raise Exception(response.errors)
Пример #27
0
    def node_list(self, node_ip, node_port):
        """Fetch a node's ``/info`` document and return a summary dict.

        Returns:
            A dict with ``state`` (always ``'1'``), ``cpus``, ``mem``,
            ``name``, ``os_version``, ``kernel_version``, ``images`` and
            ``containers``; an empty dict (after printing a message) when
            the info document is empty.
        """
        info = Curl('http://' + node_ip + ":" + node_port + "/info").get_value()

        if len(info) < 1:
            print("Can not get node info")
            return {}

        return {
            'state': '1',
            'cpus': info['NCPU'],
            'mem': info['MemTotal'],
            'name': info['Name'],
            'os_version': info['OperatingSystem'],
            'kernel_version': info['KernelVersion'],
            'images': info['Images'],
            'containers': info['Containers'],
        }
    def update_specific_trackor(self, trackor_id, data):
        """PUT the mapped fields of ``data`` onto trackor ``trackor_id``.

        ``data`` is first converted with
        ``self._field_mappings.get_ready_fields_mapping`` and the result is
        sent as a JSON body with bearer authentication.

        Returns:
            ``jsonData`` of the response.

        Raises:
            Exception: carrying ``errors`` of the response when non-empty.
        """
        endpoint = 'https://{ov_url}/api/v3/trackors/{trackor_id}'.format(
            ov_url=self._ov_auth.url, trackor_id=trackor_id)
        mapped_fields = self._field_mappings.get_ready_fields_mapping(data)
        bearer = HTTPBearerAuth(self._ov_auth.access_key,
                                self._ov_auth.secret_key)
        payload = json.dumps(mapped_fields)

        response = Curl('PUT', endpoint,
                        headers={'content-type': 'application/json'},
                        auth=bearer, data=payload)
        if len(response.errors) == 0:
            return response.jsonData
        raise Exception(response.errors)
Пример #29
0
    print response_cnt
    print os_err_cnt
    for header, cnt in header_cnts.items():
        print cnt
    print "total time: %s" % timedelta(seconds=(t2-t1))
    sys.exit(1)




# Pre-allocate a list of curl objects
# The CurlMulti instance also carries the handle pool and a task counter as
# ad-hoc attributes.
m = pycurl.CurlMulti()
m.handles = []
m.task_num = 0
# Pool size comes from the 'Connection' option of the 'client' config section.
for i in range(config.getint('client','Connection')):
    c = Curl(i, config)
    m.handles.append(c)

# Open the access log named by the 'access log' config section; the script
# exits with status 1 when that fails. (Python 2 syntax.)
try:
    logs = w3c_log(config.get('access log', 'host'), config.get('access log', 'file'))
except Exception, e:
    print "Error: failed to open log", e
    exit(1)

# Main loop
freelist = m.handles[:]
t1 = time.time()
state = 'start'
while state != 'finished':
    # If there is an url to process and a free curl object, add to multi stack
    while freelist:
Пример #30
0
__author__ = 'zhangxa'

from curl import Curl
import pycurl

from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

# Fetch the start page once at import time; ``data`` holds what get()
# returned and ``info`` what the handle's info() reported for that request.
start_url = "http://www.pcgames.com.cn/"
c = Curl()
c.set_url(start_url)
data = c.get()
info = c.info()
#print(info)

def get_charset(c_type):
    """Extract the ``charset`` parameter from a Content-Type header value.

    The original computed the charset but never returned it, so callers
    always received None. This version returns the parsed value, matches the
    parameter name case-insensitively, requires the full ``charset=`` marker
    and stops at a space or ``;`` delimiter. Surrounding quotes are removed.

    Args:
        c_type: Header value such as ``"text/html; charset=gbk"``; may be
            None or empty.

    Returns:
        The charset token as written in the header, None when ``c_type`` is
        empty or has no ``charset=`` parameter, or ``'UTF-8'`` when
        ``c_type`` is not a text value (the original's error fallback).
    """
    marker = 'charset='
    try:
        if not c_type:
            return None
        start = c_type.lower().find(marker)
    except (TypeError, AttributeError):
        # Non-text input (e.g. a number or bytes): keep the old fallback.
        return 'UTF-8'
    if start < 0:
        return None

    value = c_type[start + len(marker):]
    # The parameter ends at the first space or at the next ';' parameter.
    for delimiter in (' ', ';'):
        value = value.split(delimiter, 1)[0]
    value = value.strip('"\'')
    return value or None
Пример #31
0
 def setUp(self):
     """Create a fresh ``Curl`` object and store it on ``self.curl``."""
     self.curl = Curl()
Пример #32
0
 def _container_detail(self,node_ip,node_port,container_id):
     """Return the value fetched from a container's ``/json`` endpoint.

     Args:
         node_ip: Host part of the node address.
         node_port: Port part of the node address.
         container_id: Id placed in the ``/containers/<id>/json`` path.
     """
     location = (node_ip,node_port,container_id)
     return Curl('http://%s:%s/containers/%s/json' % location).get_value()
Пример #33
0
def fetch_url(url, nobody=0, timeout=30, follow_redirect=0, agent=USER_AGENT):
    """Fetch ``url`` with curl and return the handle plus the output buffer.

    :param url: address to request; formatted with ``"%s"`` and passed to
        ``pycurl.URL`` UTF-8 encoded.
    :param nobody: value for ``pycurl.NOBODY``.
    :param timeout: value for ``pycurl.TIMEOUT``.
    :param follow_redirect: value for ``pycurl.FOLLOWLOCATION``.
    :param agent: value for ``pycurl.USERAGENT``.
    :return: ``(curl_handle, buffer)`` on success, ``(None, None)`` when the
        request raised an exception.
    """
    # NOTE(review): pycurl normally delivers bytes to WRITEFUNCTION; confirm
    # that a text buffer is what the Curl wrapper expects here.
    t = io.StringIO()
    c = Curl()
    s = r"%s" % (url)
    c.set_option(pycurl.USERAGENT, agent)
    c.set_option(pycurl.URL, s.encode('utf-8'))
    c.set_option(pycurl.NOBODY, nobody)
    c.set_option(pycurl.FOLLOWLOCATION, follow_redirect)
    c.set_option(pycurl.WRITEFUNCTION, t.write)
    c.set_option(pycurl.TIMEOUT, timeout)
    try:
        c.get()
    except Exception:
        # Best-effort fetch: request failures map to (None, None), but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return (None, None)
    return (c, t)
Пример #34
0
 def add_new_attempt(self, notif_queue_rec_id, error_message):
     """POST a new attempt for a notification queue record.

     Args:
         notif_queue_rec_id: Record id; converted with ``str`` for the URL.
         error_message: Sent as the ``error_code`` query value.

     Raises:
         Exception: carrying ``errors`` of the response when it is non-empty.
     """
     # NOTE(review): error_message is concatenated into the query string
     # without escaping -- confirm that callers pass URL-safe text.
     endpoint = (self._url + "/api/internal/notif/queue/"
                 + str(notif_queue_rec_id)
                 + "/attempts?error_code=" + error_message)
     response = Curl('POST', endpoint, headers=self._headers, auth=self._auth)
     if len(response.errors) == 0:
         return
     raise Exception(response.errors)
Пример #35
0
__author__ = 'zhangxa'

from curl import Curl
import pycurl

from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

# Request the start page as soon as the module is loaded; ``data`` is the
# value returned by get() and ``info`` the result of the handle's info().
start_url = "http://www.pcgames.com.cn/"
c = Curl()
c.set_url(start_url)
data = c.get()
info = c.info()
#print(info)


def get_charset(c_type):
    charset = None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except:
Пример #36
0
 def setUp(self):
     """Instantiate ``Curl`` and keep the object on ``self.curl``."""
     self.curl = Curl()
Пример #37
0
from curl import Curl

curl = Curl()
url = 'http://www.baidu.com/'
# The URL was assigned but never handed to the request; pass it to get().
res = curl.get(url)
# Parenthesised form prints the same single value on Python 2 and Python 3.
print(res)
Пример #38
0
 def update_notif_queue_rec_status(self, notif_queue_rec_id, status):
     """PATCH the status of a notification queue record.

     Args:
         notif_queue_rec_id: Record id; converted with ``str`` for the URL.
         status: Sent as the ``status`` query value.

     Raises:
         Exception: carrying ``errors`` of the response when it is non-empty.
     """
     endpoint = (self._url + "/api/internal/notif/queue/"
                 + str(notif_queue_rec_id)
                 + "/update_status?status=" + status)
     response = Curl('PATCH', endpoint, headers=self._headers, auth=self._auth)
     if len(response.errors) == 0:
         return
     raise Exception(response.errors)
from curl import Curl
import os
import sys
if __name__ == "__main__":
    # Demo: fetch a URL (first command-line argument, or the curl home page)
    # and print the body, the transfer info and the OS error number.
    import pprint
    # pycurl.OS_ERRNO is used below, but the imports above this block do not
    # bring pycurl into scope; import it here so the script runs.
    import pycurl

    if len(sys.argv) < 2:
        url = 'http://curl.haxx.se'
    else:
        url = sys.argv[1]
    c = Curl()
    try:
        c.get(url)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(c.body())
        print('=' * 74 + '\n')
        pprint.pprint(c.info())
        print(c.get_info(pycurl.OS_ERRNO))
        print(c.info()['os-errno'])
    finally:
        # Release the handle even when the request or the printing fails.
        c.close()
Пример #40
0
            if self.infos == 5 and 'beliked' not in self.info.keys():
                self.info['beliked'] = int(data)

    def handle_endtag(self, tag):
        """Reset parser flags on closing tags and fire the info hook.

        A closing ``h3`` clears ``self.h3``. A closing ``ul`` while
        ``self.clearfix`` is set clears that flag and, when the instance has
        an ``infoHook`` attribute, calls it with ``self.info``.
        """
        if tag == "h3":
            self.h3 = 0
        if not self.clearfix or tag != "ul":
            return
        self.clearfix = 0
        if hasattr(self, "infoHook"):
            self.infoHook(self.info)

    def handle_startendtag(self, tag, attrs):
        """Do nothing for this tag event; both arguments are ignored."""
        pass

    @property
    def urlList(self):
        """Return whatever ``self.current_urlList()`` returns."""
        return self.current_urlList()


if __name__ == "__main__":
    # Manual check: download one user page, feed it to the parser and print
    # every info dict the parser reports through its hook.
    parser = JianShuUserInfo_HtmlParser()
    from curl import Curl
    import pycurl
    c = Curl()
    try:
        c.set_url("http://www.jianshu.com/users/d9edcb44e2f2/latest_articles")
        data = c.get()
        #parser.setParseFile("parse.txt")
        parser.setInfoHook(lambda info: print(str(info)))
        parser.feed(data.decode("utf-8"))
        parser.close()
    finally:
        # Close the curl handle even when fetching or parsing fails.
        c.close()
Пример #41
0
 def _container_detail(self, node_ip, node_port, containers_id):
     """Return the value fetched from one container's ``/json`` endpoint.

     Args:
         node_ip: Host part of the node address (string).
         node_port: Port part of the node address (string).
         containers_id: Id placed in the ``/containers/<id>/json`` path.
     """
     endpoint = 'http://' + node_ip + ":" + node_port + "/containers/" + containers_id + "/json"
     return Curl(endpoint).get_value()