def test_status_while_still_executing(self, executor):
    self.mock_executor.return_value.isalive.return_value = True
    subject = Curl('www.yahoo.com', self.mock_executor)
    result = subject.status()
    assert result == 'curl www.yahoo.com is still waiting for a response'
def crawling(self):
    curl = Curl()
    curl.set_url(self.url)
    with open(self.filename, "wb") as output:
        curl.set_option(pycurl.WRITEFUNCTION, output.write)
        curl.get()
    curl.close()
class Crawler:
    def __init__(self, url, *args, **kwargs):
        self.url = url
        self._url_lists = []
        self.initialize(*args, **kwargs)

    def initialize(self, *args, **kwargs):
        pass

    @property
    def url_lists(self):
        return self._url_lists

    def setUp(self):
        self.curl = Curl()

    def tearDown(self):
        self.curl.close()

    def grab(self):
        self.setUp()
        self.crawling()
        self.tearDown()

    def crawling(self):
        curl = self.curl
        curl.set_url(self.url)
        body = curl.get()
        print(body)
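# Minimal usage sketch for the Crawler base class above (an illustration, not
# part of the original source): subclasses override crawling(), and grab()
# drives the setUp()/crawling()/tearDown() lifecycle around it. The URL is a
# placeholder.
class PrintingCrawler(Crawler):
    def crawling(self):
        self.curl.set_url(self.url)
        self._url_lists.append(self.url)  # record what was fetched
        print(self.curl.get())

if __name__ == "__main__":
    PrintingCrawler('http://example.com/').grab()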
def crawlCraigz(iterations):
    global cities
    global links
    # Stop recursing once the paging counter has covered all requested pages.
    if get_counter(0) != 0 and iterations * 120 != get_counter(0):
        return
    cl = Curl(base_url="", fakeheaders=[
        'Cookie: cl_b=5GEZ9Y0F6RGXNBZ5tq5GrwngXVs; cl_def_hp=dallas',
    ])
    page = cl.get("http://dallas.craigslist.org/search/roo", {
        's': get_counter(120),
        'search_distance': 13,
        'postal': 75214,
        'min_price': 400,
        'max_price': 600,
        'availabilityMode': 0,
    })
    doc = lhtml.document_fromstring(page)
    for l in doc.iterlinks():
        for c in cities:
            linktext = l[2]
            linktext = linktext[14:]
            if c in str(l[0].text) or c.lower() in linktext:
                links.append(l[2] + '\n')
                print(l[2])
    return crawlCraigz(iterations)
def test_status_when_already_done_and_queried(self, executor):
    subject = Curl('www.yahoo.com', self.mock_executor)
    subject.result = 'result'
    subject.done = True
    result = subject.status()
    assert result == 'result'
def get_zip(self, url, filename):
    fp = open(filename, "wb")
    c = Curl()
    # The output target must be set before the transfer runs; c.get()
    # performs the request, so no separate perform() call is needed.
    c.set_option(c.WRITEDATA, fp)
    c.get(url)
    c.close()
    fp.close()
def test_status_with_77_exit_code(self, executor):
    self.mock_executor.return_value.isalive.return_value = False
    self.mock_executor.return_value.exitstatus = 77
    subject = Curl('www.yahoo.com', self.mock_executor)
    result = subject.status()
    assert subject.done is True
    assert subject.result == 'curl www.yahoo.com (77: SSL public key does not matched pinned public key) FAILED'
    assert result == 'curl www.yahoo.com (77: SSL public key does not matched pinned public key) FAILED'
def check_projects(self, projects):
    url = 'http://localhost:6800/listprojects.json'
    curl = Curl(url)
    res = curl.get()
    rlt = json.loads(res)
    for r in rlt['projects']:
        if r == 'soufun_s2':
            return True
    return False
def saveFile2Local(self, url):
    self.getFileNameByUrl(url)
    if self.filename:
        with open(self.filename, "wb") as output:
            curl = Curl()
            curl.set_url(url)
            curl.set_option(pycurl.WRITEFUNCTION, output.write)
            curl.get()
            curl.close()
        Log4Spider.downLog(self, "downloaded a file:[[[", self.filename, "]]]")
def test_status_with_unknown_exit_code(self, executor):
    self.mock_executor.return_value.isalive.return_value = False
    self.mock_executor.return_value.exitstatus = 78
    subject = Curl('www.yahoo.com', self.mock_executor)
    result = subject.status()
    assert subject.done is True
    assert subject.result == 'curl www.yahoo.com (78: unknown) FAILED'
    assert result == 'curl www.yahoo.com (78: unknown) FAILED'
def test_status_with_1_exit_code(self, executor):
    self.mock_executor.return_value.isalive.return_value = False
    self.mock_executor.return_value.exitstatus = 1
    subject = Curl('www.yahoo.com', self.mock_executor)
    result = subject.status()
    assert subject.done is True
    assert subject.result == 'curl www.yahoo.com (1: Unsupported protocol. This build of curl has no support for this protocol.) FAILED'
    assert result == 'curl www.yahoo.com (1: Unsupported protocol. This build of curl has no support for this protocol.) FAILED'
def test_status_with_successful_exit_code(self, executor):
    self.mock_executor.return_value.isalive.return_value = False
    self.mock_executor.return_value.exitstatus = 0
    subject = Curl('www.yahoo.com', self.mock_executor)
    result = subject.status()
    assert subject.done is True
    assert subject.result == 'curl www.yahoo.com (0: Success) SUCCEEDED'
    assert result == 'curl www.yahoo.com (0: Success) SUCCEEDED'
def schedule_spiders(self, project_name, spider_name):
    url = 'http://localhost:6800/schedule.json'
    curl = Curl(url)
    dic = {'project': project_name, 'spider': spider_name}
    res = curl.post(dic)
    rlt = json.loads(res)
    return rlt['status'] == 'ok'
def perform(self):
    curl = Curl(str(self.url))
    curl.perform(False)
    if curl.getHttpReturnCode() == 200:
        #table_re_obj = re.compile(r"<table.+?bgcolor=#b18a02.+?>.*</table>", re.S)
        #row_re_obj = re.compile(r"<tr.+?bgcolor=#ffffff.+?></tr>", re.S)
        data = []
        field_re_obj = re.compile(r"<font color=#0262cd>(\S+)</font>", re.S)
        items = field_re_obj.findall(curl.getHttpContent())
        # Fields arrive in groups of three per table row.
        for i in range(0, len(items), 3):
            data.append(items[i:i + 3])
        self.__dump(data)
def run(self): f = open("/tmp/daemon-log","w") while True: res = self.check_spiders('soufun_agent') if res: f.write('%s' % res) else: f.write('starting the soufun_agent_spider\n') url = 'http://localhost:6800/schedule.json' curl =Curl(url) dic = {'project':'soufun_s2','spider':'soufun_agent'} res = curl.post(dic) f.flush() time.sleep(10)
def get_integration_params(self):
    headers = {'content-type': 'application/json'}
    url = self._url + "/api/v3/integrations/" + self._integration_name
    curl = Curl('GET', url, headers=headers, auth=self._auth)
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
    return curl.jsonData
def curl(self, curl_what=CURL_SITE):
    # TODO prompt for the url, default if none given
    # TODO move all user interaction stuff out into command files.
    curl_count = -1
    while curl_count < 0:
        user_count = raw_input(
            "How many curls for %s? (blank line to cancel) " % self.name).strip()
        if user_count == "":
            return
        try:
            curl_count = int(user_count)
        except ValueError:
            print("%s is not a number" % user_count)
            continue
        if curl_count < 0:
            print("%i is not a valid number" % curl_count)
    print("\tstarting curl on %s..." % self.name)
    for idx in range(curl_count):
        print("%s sending curl %s #%i" % (time.ctime(), curl_what, idx))
        self.traffic_handles.append(
            Curl(curl_what, self.traffic_commands.curl))
        time.sleep(2)
    print("\tdone.")
def get_channel_by_field_name_and_trackor_id(self, field_name, tid):
    url = self._url + "/api/v3/trackors/" + str(tid) + "?fields=" + field_name
    curl = Curl('GET', url, headers=self._headers, auth=self._auth)
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
    return curl.jsonData[field_name]
def get_notif_queue(self):
    url = self._url + "/api/internal/notif/queue?service_id=" + str(self._service_id)
    curl = Curl('GET', url, headers=self._headers, auth=self._auth)
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
    return curl.jsonData
def container_list(self, node_ip, node_port):
    url = 'http://' + node_ip + ":" + node_port + "/containers/json?all=1"
    container_url = Curl(url)
    ret_json = container_url.get_value()
    con_data = {}
    container_id = []
    if ret_json:
        for i in ret_json:
            container_id.append(i['Id'][0:12])
    else:
        return con_data
    if len(container_id) < 1:
        return con_data
    con_num = 1
    for con_id in container_id:
        tmp_dict = {}
        ret_json = self._container_detail(node_ip, node_port, con_id)
        if len(ret_json) < 1:
            return con_data
        if ('Running' in ret_json['State'].keys()) and (
                'Status' not in ret_json['State'].keys()):
            # for docker 1.7
            con_state = str(ret_json['State']['Running'])
        elif 'Status' in ret_json['State'].keys():
            # for docker 1.9 and higher
            con_state = str(ret_json['State']['Status'])
        else:
            con_state = "Exited"
        tmp_dict['id_num'] = ret_json['Id'][0:12]
        tmp_dict['con_ip'] = ret_json['NetworkSettings']['IPAddress']
        tmp_dict['name'] = ret_json['Name']
        tmp_dict['cpuperiod'] = ret_json['HostConfig']['CpuPeriod']
        tmp_dict['cpuquota'] = ret_json['HostConfig']['CpuQuota']
        tmp_dict['memory'] = ret_json['HostConfig']['Memory']
        tmp_dict['state'] = con_state
        tmp_dict['cmd'] = str(ret_json['Config']['Cmd'])
        tmp_dict['created'] = ret_json['State']['StartedAt']
        con_data[con_num] = tmp_dict
        con_num += 1
    return con_data
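# Shape of the dict container_list() returns, keyed by an incrementing
# counter (values below are illustrative only, not real data):
# {1: {'id_num': 'abc123def456', 'con_ip': '172.17.0.2', 'name': '/web',
#      'cpuperiod': 100000, 'cpuquota': 50000, 'memory': 268435456,
#      'state': 'running', 'cmd': "['nginx']", 'created': '2016-01-01T00:00:00Z'}}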
def perform(self):
    curl = Curl(str(self.url))
    curl.perform(False)
    if curl.getHttpReturnCode() == 200:
        value_re_obj = re.compile(r"\"(.+?)\"", re.S)
        match = value_re_obj.search(curl.getHttpContent())
        if match:
            fields = match.groups()[0].split(",")
            self.open_price = float(fields[1].strip())
            self.close_price = float(fields[2].strip())
            self.cur_price = float(fields[3].strip())
            self.high_price = float(fields[4].strip())
            self.low_price = float(fields[5].strip())
            return True
        else:
            return False
    else:
        return False
def get_users_by_ids(self, user_ids):
    user_ids = list(set(user_ids))  # drop duplicate IDs
    url = self._url + "/api/internal/users?user_ids="
    url = url + ','.join([str(user_id) for user_id in user_ids])
    curl = Curl('GET', url, headers=self._headers, auth=self._auth)
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
    return curl.jsonData
def get_phone_number_by_field_name_and_trackor_id(self, field_name, tid):
    url = self._url + "/api/v3/trackors/" + str(tid) + "?fields=" + field_name
    curl = Curl('GET', url, headers=self._headers,
                auth=(self._username, self._password))
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
    return curl.jsonData[field_name]
def add(self, log_level, message, description=""):
    if log_level.log_level_id <= self._ov_log_level.log_level_id:
        parameters = {'message': message,
                      'description': description,
                      'log_level_name': log_level.log_level_name}
        json_data = json.dumps(parameters)
        headers = {'content-type': 'application/json'}
        url_log = self._url + "/api/v3/integrations/runs/" + str(self._process_id) + "/logs"
        curl = Curl('POST', url_log, data=json_data, headers=headers, auth=self._auth)
        if len(curl.errors) > 0:
            raise Exception(curl.errors)
        return curl.jsonData
def node_list(self, node_ip, node_port):
    node_dict = {}
    url = 'http://' + node_ip + ":" + node_port + "/info"
    node_curl = Curl(url)
    ret_json = node_curl.get_value()
    if len(ret_json) < 1:
        print("Can not get node info")
    else:
        node_dict['state'] = '1'
        node_dict['cpus'] = ret_json['NCPU']
        node_dict['mem'] = ret_json['MemTotal']
        node_dict['name'] = ret_json['Name']
        node_dict['os_version'] = ret_json['OperatingSystem']
        node_dict['kernel_version'] = ret_json['KernelVersion']
        node_dict['images'] = ret_json['Images']
        node_dict['containers'] = ret_json['Containers']
    return node_dict
def update_specific_trackor(self, trackor_id, data):
    headers = {'content-type': 'application/json'}
    url = 'https://{ov_url}/api/v3/trackors/{trackor_id}'.format(
        ov_url=self._ov_auth.url, trackor_id=trackor_id)
    fields = self._field_mappings.get_ready_fields_mapping(data)
    curl = Curl('PUT', url, headers=headers,
                auth=HTTPBearerAuth(self._ov_auth.access_key, self._ov_auth.secret_key),
                data=json.dumps(fields))
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
    return curl.jsonData
print response_cnt
print os_err_cnt
for header, cnt in header_cnts.items():
    print cnt
print "total time: %s" % timedelta(seconds=(t2 - t1))
sys.exit(1)

# Pre-allocate a list of curl objects
m = pycurl.CurlMulti()
m.handles = []
m.task_num = 0
for i in range(config.getint('client', 'Connection')):
    c = Curl(i, config)
    m.handles.append(c)

try:
    logs = w3c_log(config.get('access log', 'host'), config.get('access log', 'file'))
except Exception, e:
    print "Error: failed to open log", e
    exit(1)

# Main loop
freelist = m.handles[:]
t1 = time.time()
state = 'start'
while state != 'finished':
    # If there is an url to process and a free curl object, add to multi stack
    while freelist:
__author__ = 'zhangxa'

from curl import Curl
import pycurl
from html.parser import HTMLParser
from htmlParser.htmlParser import UrlHtmlParser
from download.downFile import DownFile
from urlHandler.urlHandler import UrlBaseHandler
from urlQueue.urlQueue import UrlQueue

start_url = "http://www.pcgames.com.cn/"
c = Curl()
c.set_url(start_url)
data = c.get()
info = c.info()
#print(info)

def get_charset(c_type):
    # Extract the charset from a Content-Type header value.
    charset = None
    try:
        if c_type and 'charset' in c_type:
            start = c_type.find('charset=')
            charset_str = c_type[start:]
            end = charset_str.find(' ')
            if end > -1:
                charset = charset_str[len('charset='):end]
            else:
                charset = charset_str[len('charset='):]
    except:
        return 'UTF-8'
    return charset
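# Usage sketch for get_charset() above (an illustration, not in the original):
# it assumes the wrapper's info() dict exposes a 'content-type' key, as the
# pycurl high-level Curl example does.
charset = get_charset(info.get('content-type')) or 'UTF-8'
print(charset)
print(data.decode(charset)[:100])  # first 100 decoded characters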
def setUp(self):
    self.curl = Curl()
def _container_detail(self, node_ip, node_port, container_id):
    url = 'http://%s:%s/containers/%s/json' % (node_ip, node_port, container_id)
    container_more_curl = Curl(url)
    ret_json = container_more_curl.get_value()
    return ret_json
def fetch_url(url, nobody=0, timeout=30, follow_redirect=0, agent=USER_AGENT):
    """Fetch url using curl

    :param url: the URL to fetch
    :param nobody: set to 1 for a header-only request
    :param timeout: request timeout in seconds
    :param follow_redirect: set to 1 to follow HTTP redirects
    :param agent: User-Agent string to send
    """
    # pycurl's write callback receives raw bytes, so buffer with BytesIO.
    t = io.BytesIO()
    c = Curl()
    c.set_option(pycurl.USERAGENT, agent)
    c.set_option(pycurl.URL, url.encode('utf-8'))
    c.set_option(pycurl.NOBODY, nobody)
    c.set_option(pycurl.FOLLOWLOCATION, follow_redirect)
    c.set_option(pycurl.WRITEFUNCTION, t.write)
    c.set_option(pycurl.TIMEOUT, timeout)
    try:
        c.get()
    except:
        return (None, None)
    return (c, t)
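# Usage sketch for fetch_url() above (an illustration, not in the original):
# on success it returns the Curl handle and the buffer the body was written
# into; both are None on failure. The URL is a placeholder.
c, body = fetch_url('http://example.com/', follow_redirect=1)
if c is not None:
    print(body.getvalue()[:200])  # first 200 bytes of the response body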
def add_new_attempt(self, notif_queue_rec_id, error_message):
    url = (self._url + "/api/internal/notif/queue/" + str(notif_queue_rec_id)
           + "/attempts?error_code=" + error_message)
    curl = Curl('POST', url, headers=self._headers, auth=self._auth)
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
from curl import Curl

curl = Curl()
url = 'http://www.baidu.com/'
res = curl.get(url)
print res
def update_notif_queue_rec_status(self, notif_queue_rec_id, status):
    url = (self._url + "/api/internal/notif/queue/" + str(notif_queue_rec_id)
           + "/update_status?status=" + status)
    curl = Curl('PATCH', url, headers=self._headers, auth=self._auth)
    if len(curl.errors) > 0:
        raise Exception(curl.errors)
from curl import Curl
import pycurl
import os
import sys

if __name__ == "__main__":
    if len(sys.argv) < 2:
        url = 'http://curl.haxx.se'
    else:
        url = sys.argv[1]
    c = Curl()
    c.get(url)
    print c.body()
    print '=' * 74 + '\n'
    import pprint
    pprint.pprint(c.info())
    print c.get_info(pycurl.OS_ERRNO)
    print c.info()['os-errno']
    c.close()
        if self.infos == 5 and 'beliked' not in self.info.keys():
            self.info['beliked'] = int(data)

    def handle_endtag(self, tag):
        if tag == "h3":
            self.h3 = 0
        if self.clearfix and tag == "ul":
            self.clearfix = 0
            if hasattr(self, "infoHook"):
                self.infoHook(self.info)

    def handle_startendtag(self, tag, attrs):
        pass

    @property
    def urlList(self):
        return self.current_urlList()

if __name__ == "__main__":
    parser = JianShuUserInfo_HtmlParser()
    from curl import Curl
    import pycurl
    c = Curl()
    c.set_url("http://www.jianshu.com/users/d9edcb44e2f2/latest_articles")
    data = c.get()
    #parser.setParseFile("parse.txt")
    parser.setInfoHook(lambda info: print(str(info)))
    parser.feed(data.decode("utf-8"))
    parser.close()
    c.close()
def _container_detail(self, node_ip, node_port, containers_id):
    url = ('http://' + node_ip + ":" + node_port + "/containers/"
           + containers_id + "/json")
    container_more_url = Curl(url)
    ret_json = container_more_url.get_value()
    return ret_json