def classify_link(link):
    '''Classify a link according to its domain.'''
    if link is None:
        return link, SITE_TYPE['junk']
    original_url = link
    url = urlparse.urlparse(link)
    max_try_count = 10
    try_count = 0
    while url.netloc in _SHORT_SERVICE:
        if try_count >= max_try_count:
            # too many redirects; return as news
            return link, SITE_TYPE['news']
        # resolve the short link to its original target
        original_url = _get_original_link(original_url)
        url = urlparse.urlparse(original_url)
        try_count += 1
    domain_token = url.netloc.split('.')
    length = len(domain_token) - 2
    while length >= 0:
        domain = '.'.join(domain_token[length:])
        if domain in _BLACK_SITE_LIST:
            return original_url, _BLACK_SITE_LIST[domain]
        length -= 1
    # treat unclassified links as news links
    return original_url, SITE_TYPE['news']
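# Hedged usage sketch for classify_link above. The module globals it relies on
# (SITE_TYPE, _SHORT_SERVICE, _BLACK_SITE_LIST, _get_original_link) are not
# shown in this corpus, so the stubs below are hypothetical, for illustration only.
def _demo_classify_link():
    global SITE_TYPE, _SHORT_SERVICE, _BLACK_SITE_LIST, _get_original_link
    SITE_TYPE = {'junk': 0, 'news': 1, 'spam': 2}
    _SHORT_SERVICE = set(['bit.ly', 't.co'])
    _BLACK_SITE_LIST = {'spam.example.com': SITE_TYPE['spam']}
    _get_original_link = lambda url: 'http://spam.example.com/page'
    print classify_link(None)                  # -> (None, 0)
    # short link resolves, then matches the blacklist suffix scan
    print classify_link('http://bit.ly/abc')   # -> ('http://spam.example.com/page', 2)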
def __init__(self, announce, piece_length=262144, **kw):
    self.piece_length = piece_length
    if not bool(urlparse.urlparse(announce).scheme):
        raise ValueError('No schema present for url')
    self.tdict = {
        'announce': announce,
        'creation date': int(time()),
        'info': {
            'piece length': self.piece_length
        }
    }
    if kw.get('comment'):
        self.tdict.update({'comment': kw.get('comment')})
    if kw.get('httpseeds'):
        if not isinstance(kw.get('httpseeds'), list):
            raise TypeError('httpseeds must be a list')
        else:
            self.tdict.update({'httpseeds': kw.get('httpseeds')})
    if kw.get('announcelist'):
        if not isinstance(kw.get('announcelist'), list):
            raise TypeError('announcelist must be a list of lists')
        if False in [isinstance(l, list) for l in kw.get('announcelist')]:
            raise TypeError('announcelist must be a list of lists')
        if False in [bool(urlparse.urlparse(f[0]).scheme)
                     for f in kw.get('announcelist')]:
            raise ValueError('No schema present for url')
        else:
            self.tdict.update({'announce-list': kw.get('announcelist')})
def __getParentPage(self):
    '''This will get the parent page info.'''
    page = {}
    try:
        self.hierarchy = page['et_thread_hierarchy'] = [
            stripHtml(x.renderContents()) for x in self.soup.find(
                'div',
                {'class': 'rd Microsoft_Msn_Boards_Read_List Web_Bindings_Base'}
            ).findAll('li')]
    except:
        log.info(self.log_msg('Thread hierarchy is not found'))
    try:
        self.forum_title = page['title'] = stripHtml(
            self.soup.find('h2').renderContents())
    except:
        log.info(self.log_msg('Title Not Found'))
        page['title'] = ''
    if checkSessionInfo(self.genre, self.session_info_out, self.parent_uri,
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True'))
        return False
    for each in ['et_author_name', 'ei_thread_replies_count',
                 'ei_thread_view_count', 'ei_author_count',
                 'et_last_post_author', 'edate_last_post_date', 'posted_date']:
        try:
            page[each] = self.task.pagedata[each]
        except:
            log.info(self.log_msg('Page data cannot be extracted for %s' % each))
    try:
        page['ei_thread_id'] = int(urlparse.urlparse(self.currenturi)[4]
                                   .split('&')[0].split('ThreadId=')[1])
    except:
        log.info(self.log_msg('Thread id not found'))
    try:
        post_hash = get_hash(page)
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out,
                                   self.parent_uri, post_hash, 'Post',
                                   self.task.instance_data.get('update'), Id=id)
        if not result['updated']:
            return False
        page['path'] = [self.parent_uri]
        page['parent_path'] = []
        page['uri'] = normalize(self.currenturi)
        page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
        page['priority'] = self.task.priority
        page['level'] = self.task.level
        page['pickup_date'] = datetime.strftime(datetime.utcnow(),
                                                '%Y-%m-%dT%H:%M:%SZ')
        page['connector_instance_log_id'] = self.task.connector_instance_log_id
        page['connector_instance_id'] = self.task.connector_instance_id
        page['workspace_id'] = self.task.workspace_id
        page['client_id'] = self.task.client_id
        page['client_name'] = self.task.client_name
        page['last_updated_time'] = page['pickup_date']
        page['versioned'] = False
        page['data'] = ''
        page['task_log_id'] = self.task.id
        page['entity'] = 'Post'
        page['category'] = self.task.instance_data.get('category', '')
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Parent Page added'))
        return True
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
        return False
def completeurl(fullurl, partialurl):
    from urllib2 import urlparse
    parsed_jobsurl = urlparse.urlparse(fullurl)
    parsed_joburl = urlparse.urlparse(partialurl)
    fulljoburl = urlparse.urlunparse([parsed_jobsurl.scheme,
                                      parsed_jobsurl.netloc,
                                      parsed_joburl.path,
                                      parsed_joburl.params,
                                      parsed_joburl.query,
                                      parsed_joburl.fragment])
    return fulljoburl
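# Hedged usage sketch for completeurl above: it keeps the scheme and host of
# fullurl and takes everything else from partialurl. The URLs are illustrative.
def _demo_completeurl():
    full = 'http://example.com/jobs?page=1'
    partial = '/job/123?ref=search'
    # Expected: 'http://example.com/job/123?ref=search'
    print completeurl(full, partial)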
def convert(self, value, context, ctx_opts):
    if value[:4] != u'http':
        value = u'http://%s' % value
    domain = urlparse.urlparse(value)[1]
    if not domain or domain == u'':
        domain = urlparse.urlparse(u'http://%s' % value)[1]
    if not domain or len(domain.split(u'.')) < 2 or \
            len(domain.split(u' ')) > 1:
        self.error('invalid_domain', value, context, ctx_opts)
    return domain.lower()
def homepage_url(self):
    """Try to ensure we prepend http: to the url if there's nothing there.

    This is to ensure we're not generating relative links in the user
    templates."""
    if not self.homepage:
        return self.homepage
    parsed = urlparse.urlparse(self.homepage)
    if parsed.scheme:
        return self.homepage
    # Vague sanity check
    abs_url = ''.join(['http://', self.homepage])
    if urlparse.urlparse(abs_url).scheme == 'http':
        return abs_url
    return self.homepage
def test_execute_async(self, call_rpc_client_mock):
    with patch.object(ResponseUnavailableViewMixing, 'verify',
                      return_value=None) as mock_method:
        call_rpc_client_mock.return_value = True
        self.client1.login(username='******', password='******')
        from urllib2 import urlparse
        # get redirect
        response = self.client1.get(self.base1_apy1.get_exec_url() + "&async")
        self.assertEqual(301, response.status_code)
        queries = urlparse.urlparse(response['Location'])[4]
        rid = int(urlparse.parse_qs(queries)['rid'][0])
        transaction = Transaction.objects.get(pk=rid)
        # get state (RUNNING)
        #response = self.client1.get(self.base1_apy1.get_exec_url()+"&rid=%s" % rid, HTTP_ACCEPT='application/xml')
        response = self.client1.get(self.base1_apy1.get_exec_url() + "&rid=%s" % rid)
        self.assertEqual(200, response.status_code)
        tout = {u'status': u'RUNNING',
                "url": "/fastapp/base/base1/exec/base1_apy1/?json=&rid=" + str(rid),
                'rid': rid, 'id': u'base1_apy1'}
        self.assertEqual(json.loads(response.content)['rid'], tout['rid'])
        # mock creation of response
        tout = {u'status': u'OK', u'exception': None,
                u'returned': u'{"content": "{\\"aaa\\": \\"aaa\\"}", "class": "XMLResponse", "content_type": "application/json"}',
                u'response_class': u'JSONResponse', 'time_ms': '74',
                'rid': rid, 'id': u'base1_apy1'}
        transaction.tout = tout
        transaction.save()
        self.assertEqual(transaction.apy, self.base1_apy1)
        # get response
        response = self.client1.get(self.base1_apy1.get_exec_url() + "&rid=%s" % rid)
        self.assertEqual(200, response.status_code)
        # check transaction duration
        transaction = Transaction.objects.get(pk=rid)
        self.assertEqual(int, type(transaction.duration))
def send_email(self, to='', subject='', body='', cc='', bcc=''):
    log.info('sending a mail')
    data = dict(nvp_bu_send='Send')
    for name in 'to subject body cc bcc'.split():
        if vars()[name]:
            data[name] = vars()[name].encode('utf-8')
    if not hasattr(self, 'sendpath'):
        response = self.internal_http_opener.open(
            self.internalBaseMailUrl + '?ui=html')
        from urllib2 import urlparse
        respurl = urlparse.urlparse(response.geturl())
        try:
            response.close()
        except:
            pass
        del response
        self.sendpath = respurl.path
    url = 'https://mail.google.com' + self.sendpath
    try:
        at = self.gmail_at
    except KeyError:
        at = ''
    params = dict(at=at, v='b', pv='tl', s='s', fv='b', cpt='c', cs='c')
    if not self.hosted:
        params.update(fv='b', cpt='c', cs='c')
    else:
        params.update(cs='b', s='s')
    url = UrlQuery(url, params)
    response = self.webrequest(url, follow_js_redirects=True, **data)
    log.info('sent a mail')
    assert response and ('Your message has been sent.' in response)
    log.info('send mail success: %r',
             bool('Your message has been sent.' in response))
    return True
def get_document(self, url):
    """Connects to the server and retrieves the document."""
    set_status(_('Contacting SomaFM server...'))
    hostinfo = urlparse.urlparse(url)
    try:
        c = httplib.HTTPConnection(hostinfo.netloc, timeout=20)
    except TypeError:
        # Python 2.5 doesn't support the timeout argument
        c = httplib.HTTPConnection(hostinfo.netloc)
    try:
        c.request('GET', hostinfo.path,
                  headers={'User-Agent': self.user_agent})
        response = c.getresponse()
    except (socket.timeout, socket.error):
        raise radio.RadioException(_('Error connecting to SomaFM server.'))
    if response.status != 200:
        raise radio.RadioException(_('Error connecting to SomaFM server.'))
    document = response.read()
    c.close()
    set_status('')
    return document
def on_navigation_requested(self, view, frame, req, data=None):
    uri = req.get_uri()
    parse = urlparse.urlparse(uri)
    if self.url_callback.find(parse.hostname) > 0:
        self.getAccessToken(parse)
        return True
    return False
def search_regulars(self):
    """Search urls inside the <A> tags."""
    urls = set()
    tree = XPathExtractor().get_object(self.response.raw_html)
    for link_tag in tree.xpath("//a"):
        if 'href' not in link_tag.attrib:
            continue
        url = link_tag.attrib["href"]
        if not urlparse.urlparse(url).netloc:
            url = self._fix_url(url)
        url = self._normalize_url(url)
        urls.add(url)
    return urls
def search1(self, search):
    url_list = []          # store all the extracted urls
    title_list = []        # store all the extracted titles
    description_list = []  # store all the extracted descriptions
    for start in range(0, 10):
        # paginate through the results via the 'start' query parameter
        page = requests.get('http://www.google.de/search?q=' + search
                            + '&start=' + str(start * 10), verify=False)
        soup = BeautifulSoup(page.text, 'html.parser')
        for cite in soup.findAll('a', attrs={'class': 'r'}):  # extract all URLs
            url = cite.text
            print url
            if not urlparse.urlparse(url).scheme:  # check if url has an http:// prefix
                url = 'http://' + url
                print url
            url_list.append(url.replace('https://', 'http://'))
        for tit in soup.findAll('div', attrs={'class': 'ellip'}):  # extract all titles
            print tit.text
            title_list.append(tit.text)
        for descr in soup.findAll('span', attrs={'class': 'st'}):  # extract all descriptions
            print descr.text
            description_list.append(descr.text)
    print title_list
def is_url(name):
    try:
        result = urlparse.urlparse(name)
    except Exception:
        return False
    else:
        return result.scheme in ('http', 'https', 'file', 'ftp')
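# A small sanity check for is_url, assuming the Python 2 urlparse module is in
# scope as in the rest of this file; the sample strings are illustrative.
def _demo_is_url():
    assert is_url('http://example.com/index.html')
    assert is_url('file:///tmp/data.txt')
    assert not is_url('just some text')  # no recognized scheme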
def __call__(self, **kwargs):
    field = self.context.getField('dataTitle')
    if field:
        self.info['source']['title'] = field.getAccessor(self.context)()
    field = self.context.getField('dataLink')
    if field:
        self.info['source']['url'] = field.getAccessor(self.context)()
    field = self.context.getField('dataOwner')
    if field:
        vocab = field.Vocabulary(self.context)
        url = field.getAccessor(self.context)()
        title = self.context.displayValue(vocab, url)
        self.info['owner']['title'] = title
        parser = urlparse.urlparse(url)
        if all((parser.scheme, parser.netloc)):
            self.info['owner']['url'] = url
        else:
            self.info['owner']['url'] = self.info['source']['url']
    if self.info['source']['title'] or self.info['source']['url']:
        return self.info
    return self.fallback()
def get_url_list(self, xmlrpc_uri):
    """Create a list of urls consisting of the available IPA servers."""
    # the configured URL defines what we use for the discovered servers
    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(xmlrpc_uri)
    servers = []
    name = '_ldap._tcp.%s.' % self.env.domain
    rs = dnsclient.query(name, dnsclient.DNS_C_IN, dnsclient.DNS_T_SRV)
    for r in rs:
        if r.dns_type == dnsclient.DNS_T_SRV:
            rsrv = r.rdata.server.rstrip('.')
            servers.append('https://%s%s' % (ipautil.format_netloc(rsrv), path))
    servers = list(set(servers))
    # the list/set conversion won't preserve order so stick in the
    # local config file version here.
    cfg_server = xmlrpc_uri
    if cfg_server in servers:
        # make sure the configured master server is there just once and
        # it is the first one
        servers.remove(cfg_server)
        servers.insert(0, cfg_server)
    else:
        servers.insert(0, cfg_server)
    return servers
def get_lists(self, no_cache=False):
    """Returns the rlists for icecast."""
    from xlgui.panel import radio
    if no_cache or not self.data:
        set_status(_('Contacting Icecast server...'))
        hostinfo = urlparse.urlparse(self.genre_url)
        try:
            c = httplib.HTTPConnection(hostinfo.netloc, timeout=20)
        except TypeError:
            # python 2.5 doesn't have timeout=
            c = httplib.HTTPConnection(hostinfo.netloc)
        try:
            c.request('GET', hostinfo.path,
                      headers={'User-Agent': self.user_agent})
            response = c.getresponse()
        except (socket.timeout, socket.error):
            raise radio.RadioException(
                _('Error connecting to Icecast server.'))
        if response.status != 200:
            raise radio.RadioException(
                _('Error connecting to Icecast server.'))
        body = response.read()
        c.close()
        set_status('')
        data = {}
        dom = minidom.parseString(body)
        divs = dom.getElementsByTagName('div')
        for div in divs:
            if div.getAttribute('id') == 'content':
                anchors = div.getElementsByTagName('a')
                for anchor in anchors:
                    anchor.normalize()
                    for node in anchor.childNodes:
                        if node.nodeType == minidom.Node.TEXT_NODE:
                            data[node.nodeValue] = anchor.getAttribute('href')
                            break
                break
        self.data = data
        self._save_cache()
    else:
        data = self.data
    rlists = []
    for item in data.keys():
        rlist = RadioList(item, station=self)
        rlist.get_items = lambda no_cache, name=item: \
            self._get_subrlists(name=name, no_cache=no_cache)
        rlists.append(rlist)
    sort_list = [(item.name, item) for item in rlists]
    sort_list.sort()
    rlists = [item[1] for item in sort_list]
    self.rlists = rlists
    return rlists
def generate_cookie(self, url_path, session_id, expiration=None, add_header=False):
    '''
    Return a session cookie containing the session id. The cookie will be
    constrained to the url path, defined for use with HTTP only, and only
    returned on secure connections (SSL).

    :parameters:
      url_path
        The cookie will be returned in a request if it begins with this
        url path.
      session_id
        The session id identified by the session cookie.
      add_header
        If true, format the cookie string with a Set-Cookie: header.

    :returns:
      cookie string
    '''
    if not expiration:  # catch zero unix timestamps
        expiration = None
    cookie = Cookie(self.session_cookie_name, session_id,
                    domain=urlparse.urlparse(api.env.xmlrpc_uri).netloc,
                    path=url_path, httponly=True, secure=True,
                    expires=expiration)
    if add_header:
        result = 'Set-Cookie: %s' % cookie
    else:
        result = str(cookie)
    return result
def write(self, url, start_time, end_time, ok, reason):
    '''
    @param url: must be a complete HTTP request address, e.g.
        http://127.0.0.1:8080/xxx/?sdfa=fas
    @param ok: boolean -> True or False
    @param reason: the error reason; a string that must not contain
        a comma ','
    '''
    process_time = int((end_time - start_time) * 1000)  # milliseconds
    urlps = urlparse.urlparse(url)
    host = '%s%s' % (urlps.hostname, (':%s' % urlps.port) if urlps.port else '')
    url = urlps.path or '/'
    for iu in self.ignore_url:
        if url.startswith(iu):
            return
    ok = ok and 1 or 0
    if self.log_format == 'comma':
        msg = '%s,%s,%s,%s,%s,%s,%s,%s' % (
            self.server_name, self.project_name, host, url,
            process_time, ok, int(start_time), reason)
    elif self.log_format == 'json':
        msg = {
            'server_name': self.server_name,
            'project_name': self.project_name,
            'host': host,
            'url': url,
            'create_time': int(start_time),
            'process_time': process_time,
            'ok': ok,
            'reason': reason
        }
        msg = simplejson.dumps(msg)
    self.write_log(msg)
def __addPosts(self, link):
    '''It will add the post.'''
    try:
        self.currenturi = link
        if checkSessionInfo('review', self.session_info_out, self.currenturi,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'
                                  % self.currenturi))
            return False
        self.__setSoupForCurrentUri()
        page = self.__getData()
        if not page:
            return True
        result = updateSessionInfo('review', self.session_info_out,
                                   self.currenturi, get_hash(page), 'review',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [self.currenturi]
            page['parent_path'] = []
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            page['entity'] = 'review'
            page.update(self.__task_elements_dict)
            self.pages.append(page)
            log.info(self.log_msg('Page added'))
        else:
            log.info(self.log_msg('Update session info returns False for '
                                  'url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'
                                   % self.currenturi))
    return False
def open_source(url, decode=True):
    parsed = urlparse.urlparse(url)
    if parsed.scheme and len(parsed.scheme) > 0 and parsed.scheme[0] == "z":
        import zcom
        return zcom.Connection(url, codec=decode).items()
    else:
        return sharditerator(url, decode=decode, source=url)
def gopen(url, mode="rb"):
    """Open the given URL. Supports unusual schemes and uses subprocesses."""
    parsed = urlparse.urlparse(url)
    if parsed.scheme == "gs":
        if mode[0] == "r":
            return os.popen("gsutil cat '%s'" % url, "rb")
        elif mode[0] == "w":
            return os.popen("gsutil cp - '%s'" % url, "wb")
        else:
            raise ValueError("{}: unknown mode".format(mode))
    elif parsed.scheme in "http https ftp".split():
        if mode[0] == "r":
            cmd = "curl --fail -s '%s'" % url
            return os.popen(cmd, "rb")
        elif mode[0] == "w":
            test_curl_write(url)
            cmd = "curl --fail -s -T - '%s'" % url
            return os.popen(cmd, "wb")
        else:
            raise ValueError("{}: unknown mode".format(mode))
    elif parsed.scheme in ["", "file"]:
        if mode[0] == "r":
            return open(parsed.path, "rb")
        elif mode[0] == "w":
            return open(parsed.path, "wb")
        else:
            raise ValueError("{}: unknown mode".format(mode))
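# Minimal usage sketch for gopen, restricted to the local-file branch so it
# does not shell out to gsutil or curl; the path is illustrative.
def _demo_gopen():
    with open('/tmp/gopen_demo.txt', 'wb') as f:
        f.write('hello\n')
    stream = gopen('file:///tmp/gopen_demo.txt', 'rb')
    print stream.read()  # -> 'hello\n'
    stream.close()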
def is_file(name):
    try:
        result = urlparse.urlparse(name)
    except Exception:
        return False
    else:
        return result.scheme == 'file'
def __addPosts(self, links, parent_list):
    """Given a list of links to the discussion post, fetch the post
    contents and the author info."""
    h = HTTPConnection()
    for link in links:
        try:
            page = {}
            object_id = re.search('objectID=(\d+)', link).group(1)
            # Using the redirected url instead of the url given by the search page
            link = "http://communities.vmware.com/message/%s#%s" % (object_id,
                                                                    object_id)
            self.currenturi = link
            page['uri'] = normalize(link)
            log.debug(self.log_msg("Fetching the post url %s" % (self.currenturi)))
            if checkSessionInfo(self.genre, self.session_info_out,
                                self.currenturi,
                                self.task.instance_data.get('update'),
                                parent_list=parent_list):
                # No need to pick this page
                continue
            res = self._getHTML()
            self.rawpage = res['result']
            self._setCurrentPage()
            # First try extracting from the post body
            if not self.__extractPostBody(page, object_id):
                # if that fails, extract from the replies
                self.__extractReplyBody(page, object_id)
        except:
            log.exception(self.log_msg("exception in extracting page"))
            continue
        page['posted_date'] = datetime.datetime.strftime(page['posted_date'],
                                                         "%Y-%m-%dT%H:%M:%SZ")
        checksum = md5.md5(''.join(sorted(page.values()))
                           .encode('utf-8', 'ignore')).hexdigest()
        id = None
        if self.session_info_out == {}:
            id = self.task.id
        result = updateSessionInfo(self.genre, self.session_info_out,
                                   self.currenturi, checksum, 'Post',
                                   self.task.instance_data.get('update'),
                                   parent_list=parent_list, Id=id)
        if result['updated']:
            page['path'] = page['parent_path'] = parent_list
            page['path'].append(self.currenturi)
            page['priority'] = self.task.priority
            page['level'] = self.task.level
            page['pickup_date'] = datetime.datetime.strftime(
                datetime.datetime.utcnow(), "%Y-%m-%dT%H:%M:%SZ")
            page['connector_instance_log_id'] = self.task.connector_instance_log_id
            page['connector_instance_id'] = self.task.connector_instance_id
            page['workspace_id'] = self.task.workspace_id
            page['client_id'] = self.task.client_id  # TODO: Get the client from the project
            page['client_name'] = self.task.client_name
            page['last_updated_time'] = page['pickup_date']
            page['versioned'] = False
            page['entity'] = 'Review'
            page['category'] = self.task.instance_data.get('category', '')
            page['task_log_id'] = self.task.id
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            # Calculate the hash and get the session info thingy
            self.pages.append(page)
    return True
def fetch_photos_from_msg(self, album, msg=None):
    u = album.user
    token = get_access_token(u)
    graph = facebook.GraphAPI(token)
    if msg.status == 'awaiting':
        parts = urlparse.urlparse(msg.next_page)
        qs = urlparse.parse_qs(parts.query)
        after = qs.get('after')[0]
        photos = graph.get_object(album.fb_album_id + "/photos",
                                  fields='id,source', limit=2, after=after)
        new_next_page = photos.get('paging').get('next')
        new_msg = Message.objects.create(next_page=new_next_page, user=u,
                                         status='awaiting')
        for photo in photos.get('data'):
            img_temp = NamedTemporaryFile(delete=True)
            img_temp.write(urlopen(photo.get('source')).read())
            img_temp.flush()
            photo_object = Photo.objects.create(
                title=photo.get('id'),
                description=photo.get('created_time'),
                album=album,
                file=File(img_temp))
            pprint(photo_object.filename)
            self.stdout.write('Successfully fetched photo for source "%s"\n'
                              % photo.get('source'))
        msg.status = 'done'
        msg.save()
        self.stdout.write('Finished this queue "%s"\n' % new_msg.next_page)
def __setParentPage(self):
    """This will get the parent info."""
    page = {}
    try:
        page['et_thread_hierarchy'] = self.__hierarchy = [
            x.strip() for x in stripHtml(self.soup.find(
                'div', 'deck breadcrumbs').renderContents()).split('>')
            if x.strip()][1:]
        page['data'] = page['title'] = page['et_thread_hierarchy'][-1]
    except:
        log.exception(self.log_msg('Thread hierarchy and Title Not found '
                                   'for uri %s' % self.currenturi))
        return
    if checkSessionInfo(self.__genre, self.session_info_out,
                        self.task.instance_data['uri'],
                        self.task.instance_data.get('update')):
        log.info(self.log_msg('Session info return True, Already exists'))
        return
    try:
        result = updateSessionInfo('review', self.session_info_out,
                                   self.task.instance_data['uri'],
                                   get_hash(page), 'forum',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [self.task.instance_data['uri']]
            page['parent_path'] = []
            page['uri'] = self.currenturi
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['data'] = ''
            page['entity'] = 'thread'
            page.update(self.__task_elements_dict)
            page['posted_date'] = page['pickup_date']
            self.pages.append(page)
            log.info(self.log_msg('Parent Page Added'))
        else:
            log.info(self.log_msg('Result[updated] returned False for uri %s'
                                  % self.currenturi))
    except:
        log.exception(self.log_msg("parent post couldn't be parsed"))
def __getParentPage(self, comment):
    """This will get the parent info."""
    page = {}
    try:
        self.__total_replies_count = page['ei_data_replies_count'] = \
            int(stripHtml(comment.find('totalreplies').renderContents()))
        page['title'] = page['data'] = stripHtml(
            comment.find('name').renderContents())
        page['posted_date'] = stripHtml(comment.find('dateadded')
                                        .renderContents()).split('.')[0]
        unique_key = stripHtml(comment.find('messageid').renderContents())
        if checkSessionInfo(self.__genre, self.session_info_out,
                            self.task.instance_data['uri'],
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info return True, Already exists'))
            return
        result = updateSessionInfo('review', self.session_info_out,
                                   self.task.instance_data['uri'],
                                   get_hash(page), 'forum',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            page['path'] = [unique_key]
            page['parent_path'] = []
            page['uri'] = self.currenturi
            page['uri_domain'] = unicode(urlparse.urlparse(page['uri'])[1])
            page['entity'] = 'post'
            page.update(self.__task_elements_dict)
            log.info(page['data'])
            self.pages.append(page)
        else:
            log.info(self.log_msg('Result[updated] returned False for uri %s'
                                  % self.currenturi))
    except:
        log.exception(self.log_msg('Hierarchy/Title not found in url %s'
                                   % self.currenturi))
    return
def __addPost(self, post, is_question=False):
    try:
        unique_key = re.search(r'(\d+)', post['id']).groups()[0]
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list=[self.task.instance_data['uri']]):
            log.info(self.log_msg('Session info returns True for %s' % unique_key))
            return False
        page = self.__getData(post, is_question)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata '
                                  'returns False for uri %s' % self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out,
                                   unique_key, get_hash(page), 'forum',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.task.instance_data['uri']])
        if result['updated']:
            page['parent_path'] = [self.task.instance_data['uri']]
            page['path'] = [self.task.instance_data['uri'], unique_key]
            page['uri'] = self.__baseuri + 'showpost.php?p=' + unique_key
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for '
                                  'url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'
                                   % self.currenturi))
    return True
def __call__(self, **kwargs):
    field = self.context.getField('provenances')
    provenances = field.getAccessor(self.context)()
    formatted_provenances = []
    for provenance in provenances:
        title = provenance.get('title', '')
        link = provenance.get('link', '')
        owner = provenance.get('owner', '')
        if title != '' or owner != '' or link != '':
            formatted_provenance = {'source': {}, 'owner': {}}
            formatted_provenance['source']['title'] = title
            formatted_provenance['source']['url'] = link
            if owner != '':
                if hasVocab:
                    owner_title = tmpOrganisationsVocabulary.\
                        getDisplayList(self.context).getValue(owner)
                else:
                    owner_title = owner
                formatted_provenance['owner']['title'] = owner_title
                parser = urlparse.urlparse(owner)
                if all((parser.scheme, parser.netloc)):
                    formatted_provenance['owner']['url'] = owner
                else:
                    formatted_provenance['owner']['url'] = link
            formatted_provenances.append(formatted_provenance)
    self.info['provenances'] = formatted_provenances
    return self.info
def html2rest(html, writer=sys.stdout, encoding='utf8', relto=None,
              preprocess=None, wrap_width=80, nowrap=False,
              embedded_uri=False):
    relroot = relpath = None
    if relto:
        parsed = urlparse.urlparse(relto)
        relroot = parsed.scheme + '://' + parsed.netloc
        relpath = relroot + parsed.path
        if relpath[-1] != '/':
            relpath += '/'
    if preprocess:
        html = preprocess(html, encoding=encoding)
    parser = Parser(writer, encoding, relroot, relpath,
                    wrap_width=wrap_width, nowrap=nowrap,
                    embedded_uri=embedded_uri)
    #parser.feed(readsoup(html))
    parser.feed(html.decode(encoding))
    parser.close()
def __init__(self, parserobots=False, output=None, report=False, domain="",
             exclude=[], skipext=[], drop=[], debug=False):
    self.parserobots = parserobots
    self.output = output
    self.report = report
    self.domain = domain
    self.exclude = exclude
    self.skipext = skipext
    self.drop = drop
    self.debug = debug
    if self.debug:
        logging.basicConfig(level=logging.DEBUG)
    self.tocrawl = set([domain])
    try:
        self.target_domain = urlparse.urlparse(domain)[1]
    except:
        raise ValueError("Invalid domain")
    if self.output:
        try:
            self.output_file = open(self.output, 'w')
        except:
            logging.debug("Output file not available.")
            exit(255)
def check_config():
    """
    Check crucial configuration details for existence and workability.

    Runs checks to see whether the bugtracker's URL is reachable, whether
    the backend is available at the right filename, and whether the script
    has the key arguments it needs to run: URL, backend, and database
    details.

    The filename for the backend in the backends/ directory needs to be
    the same as the configuration argument specifying that backend. For
    instance, invoking the Launchpad backend uses 'lp', and so the
    filename is 'lp.py'.
    """
    Config.check_params(['url', 'backend'])
    if Config.backend + ".py" not in Backend.get_all_backends():
        raise InvalidConfig('Backend "' + Config.backend + '" does not exist')
    url = urlparse.urlparse(Config.url)
    check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
    print("Checking URL: " + check_url)
    req = Request(check_url)
    try:
        response = urlopen(req)
    except HTTPError, e:
        raise InvalidConfig('The server could not fulfill the request '
                            + str(e.msg) + '(' + str(e.code) + ')')
def __addPost(self, post, is_question=False):
    """This will take the post tag, fetch data and meta data, and add it
    to self.pages."""
    try:
        unique_key_tag = post.find('a', id=re.compile('postcount\d+'))
        #unique_key = self.__removeSessionId('http://htcpedia.com/forum/' + unique_key_tag['href'])
        unique_key = unique_key_tag['id']
        log.info(unique_key)
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'
                                  % unique_key))
            return False
        page = self.__getData(post, is_question)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata '
                                  'returns False for uri %s' % self.currenturi))
            return True
        result = updateSessionInfo(self.__genre, self.session_info_out,
                                   unique_key, get_hash(page), 'forum',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            page['parent_path'] = []
            page['path'] = [self.task.instance_data['uri'], unique_key]
            page['uri'] = self.currenturi + "#" + unique_key
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            log.info(page)
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for '
                                  'url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'
                                   % self.currenturi))
    return True
def process_waze_message(token):
    connection = httplib.HTTPConnection("waze.to")
    connection.request("GET", "/" + token)
    response = connection.getresponse()
    meetingUrl = response.getheader("location")
    logging.debug(meetingUrl)
    querystring = urlparse.urlparse(meetingUrl).query
    token = urlparse.parse_qs(querystring)["token"][0]
    logging.debug(token)
    meetingInfoUrl = ("http://mobile-web.waze.com/SocialMediaServer/internal/"
                      "getMeetingInfo?event_id=" + token)
    logging.debug(meetingInfoUrl)
    meetingInfo = json.loads(urllib2.urlopen(meetingInfoUrl).read())
    logging.debug(meetingInfo)
    driveUrl = ("http://mobile-web.waze.com/rtserver/web/PickUpGetDriverInfo"
                "?clientID=70a8b694c7&routeTimestamp=0&getUserInfo=true&token="
                + token)
    logging.debug(driveUrl)
    driverInfo = json.loads(urllib2.urlopen(driveUrl).read())
    logging.debug(driverInfo)
    if driverInfo['status'] == 'ok':
        eta = driverInfo['eta']
        insert_waze_request(token, eta)
    else:
        deactivate_waze_request(token)
def clean(self, value):
    if not isinstance(value, basestring):
        self.raise_config_error("is not a URL string.")
    # URLs must be bytes, not unicode.
    if isinstance(value, unicode):
        value = value.encode('utf-8')
    return urlparse.urlparse(value)
def __addPost(self, post):
    '''It will add the post.'''
    try:
        page = self.__getData(post)
        if not page:
            return True
        unique_key = get_hash({'data': page['data']})
        if checkSessionInfo('review', self.session_info_out, unique_key,
                            self.task.instance_data.get('update'),
                            parent_list=[self.currenturi]):
            log.info(self.log_msg('Session info returns True'))
            return False
        result = updateSessionInfo('review', self.session_info_out, unique_key,
                                   get_hash(page), 'Review',
                                   self.task.instance_data.get('update'),
                                   parent_list=[self.currenturi])
        if not result['updated']:
            log.info(self.log_msg('Update session info returns False'))
            return True
        page['path'] = [self.currenturi]
        page['parent_path'] = []
        #page['path'].append(unique_key)
        page['uri'] = self.currenturi
        page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
        page['entity'] = 'post'
        page.update(self.__task_elements_dict)
        self.pages.append(page)
        log.info(page)
        log.info(self.log_msg('Post Added'))
        return True
    except:
        log.exception(self.log_msg('Error while adding session info'))
        return False
def check_config():
    """
    Check crucial configuration details for existence and workability.

    Runs checks to see whether the bugtracker's URL is reachable, whether
    the backend is available at the right filename, and whether the script
    has the key arguments it needs to run: URL, backend, and database
    details.

    The filename for the backend in the backends/ directory needs to be
    the same as the configuration argument specifying that backend. For
    instance, invoking the Launchpad backend uses 'lp', and so the
    filename is 'lp.py'.
    """
    Config.check_params(['url', 'backend'])
    if Config.backend + ".py" not in Backend.get_all_backends():
        raise InvalidConfig('Backend "' + Config.backend + '" does not exist')
    url = urlparse.urlparse(Config.url)
    check_url = urlparse.urljoin(url.scheme + '://' + url.netloc, '')
    print("Checking URL: " + check_url)
    req = Request(check_url)
    if Config.backend != 'github':
        try:
            response = urlopen(req)
        except HTTPError, e:
            raise InvalidConfig('The server could not fulfill the request '
                                + str(e.msg) + '(' + str(e.code) + ')')
        except URLError, e:
            raise InvalidConfig('We failed to reach a server. ' + str(e.reason))
def __addPost(self, post):
    """This will take the post tag, fetch data and meta data, and add it
    to self.pages."""
    try:
        page = self.__getData(post)
        if not page:
            log.info(self.log_msg('page contains empty data, getdata '
                                  'returns False for uri %s' % self.currenturi))
            return True
        unique_key = get_hash(page)
        if checkSessionInfo(self.__genre, self.session_info_out, unique_key,
                            self.task.instance_data.get('update')):
            log.info(self.log_msg('Session info returns True for uri %s'
                                  % unique_key))
            return False
        result = updateSessionInfo(self.__genre, self.session_info_out,
                                   unique_key, get_hash(page), 'forum',
                                   self.task.instance_data.get('update'))
        if result['updated']:
            page['parent_path'] = []
            page['path'] = [unique_key]
            page['uri'] = self.currenturi
            page['uri_domain'] = urlparse.urlparse(page['uri'])[1]
            log.info(page)
            page.update(self.__task_elements_dict)
            self.pages.append(page)
        else:
            log.info(self.log_msg('Update session info returns False for '
                                  'url %s' % self.currenturi))
    except:
        log.exception(self.log_msg('Cannot add the post for the uri %s'
                                   % self.currenturi))
    return True
def test_compare_triples():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))
        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)
            fname = path.join(dump_path, fname)
            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)
            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)
            both, first, second = graph_diff(g_fdp, g_dump)
            n_first = len(first)
            # n_second = len(second)
            # n_both = len(both)
            assert_equals(
                n_first, 0,
                '{} triple(s) different from reference:\n\n{}===\n{}\n'.format(
                    n_first, first.serialize(format='turtle'),
                    second.serialize(format='turtle')))
def __call__(self, text="", match=""): match = match.replace('https://', 'http://') soup = BeautifulSoup(text) items = set() for link in soup.find_all('a'): href = link.get('href') if href is None: continue href = href.replace('https://', 'http://') ourl = urlparse.urlparse(href) ourl = ourl._replace(fragment='', query='', params='') href = ourl.geturl() if u'resolveuid' in href: items.add(href) elif href.startswith('../'): items.add(href) elif match and href.startswith(match): found = href[len(match):] found = found.strip('/') items.add(found) elif href.startswith('/'): href = href.strip('/') items.add(href) for link in items: yield { 'count': 1, 'type': 'Link', 'text': link, 'relevance': '100.0' }
def check_link(url):
    '''
    Checks a link whose URL starts with 'http'.

    Ignores links that start with https://forums.aws.amazon.com because
    you have to be signed in to the forum for the link to be valid.

    Uses urllib2 to parse the URL and check that it is valid.

    @returns True if the link is valid, False otherwise.
    '''
    logger.debug("Checking {}".format(url))
    if re.match(r'https://forums\.aws\.amazon\.com/', url):
        return True
    try:
        if not urlparse.urlparse(url).netloc:
            return False
        website = urlopen(url)
        html = website.read()
        if website.code != 200:
            return False
    except Exception, e:
        logger.exception("")
        return False
    # the docstring promises True for a valid link
    return True
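# Illustrative calls for check_link; the real result depends on network access
# and the module-level logger, so this is a sketch rather than a test.
def _demo_check_link():
    print check_link('https://forums.aws.amazon.com/thread/1')  # skipped -> True
    print check_link('not-a-url')                               # no netloc -> False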
def cookies(self):
    """Cookies."""
    if self._cookies is None:
        ac_cookie = self.request.cookies.get('__ac', None)
        if not ac_cookie:
            self._cookies = {}
            return self._cookies
        ## XXX There is a bug with wkhtmltopdf --cookie param,
        ## thus we'll use --cookie-jar instead.
        ## EEA ticket #21958. wkhtmltopdf tickets #1870, #1903
        url = urlparse.urlparse(self.context.absolute_url())
        domain = url.hostname
        # Also allow CDN resources
        if domain.startswith(u"www."):
            domain = domain.replace(u"www.", u".", 1)
        cookie = u"__ac={cookie}; domain={domain}; path=/;".format(
            cookie=ac_cookie, domain=domain)
        with tempfile.NamedTemporaryFile(
                prefix='eea.converter.', suffix='.cookie.jar',
                dir=TMPDIR(), delete=False) as ofile:
            ofile.write(cookie)
        self._cookies = ofile.name
    return self._cookies
def test_compare_triple_counts():
    for mime, fext in MIME_TYPES.items():
        dump_path = path.join(DUMP_DIR, path.basename(mime))
        for url in URLs:
            if six.PY2:
                fname = '%s.%s' % (path.basename(urlparse.urlparse(url).path), fext)
            else:
                fname = '%s.%s' % (path.basename(urlparse(url).path), fext)
            fname = path.join(dump_path, fname)
            req = Request(url)
            req.add_header('Accept', mime)
            res = urlopen(req)
            g_fdp.parse(data=res.read(), format=mime)
            g_dump.parse(fname, format=mime)
            # triple counts
            nt_fdp, nt_dump = len(g_fdp), len(g_dump)
            assert_equals(nt_fdp, nt_dump,
                          'Triple counts differ: %d (FDP) vs. %d (ref)'
                          % (nt_fdp, nt_dump))
def extract_images(self, soup, tempdir):
    index = 0
    for img in soup.body.findAll('img'):
        if not img.get('src'):
            img['src'] = ''
            continue
        img['src'] = self.convert_virtual_url(img['src'])
        width = 0
        height = 0
        img_data = None
        if img['src'].startswith('//'):
            img['src'] = 'http:' + img['src']
        # local image
        elif img['src'].startswith('/'):
            img_data, width, height = self.get_local_image(img['src'])
        # remote image
        if img['src'].startswith('http'):
            _, remote_server, img_path, _, _, _ = urlparse.urlparse(img['src'])
            try:
                conn = httplib.HTTPConnection(remote_server)
                conn.request('GET', img_path)
                resp = conn.getresponse()
                if not resp.status == 200:
                    logger.warn('Could not get image {0}: {1} {2}'.format(
                        img['src'], resp.status, resp.reason))
                else:
                    img_data = resp.read()
            except Exception as e:
                logger.warn('Error getting remote image {0}: {1}'.format(
                    img['src'], e))
            conn.close()
        if not img_data:
            continue
        img_orig_id = img['src'].split('/')[-1]
        img_ext = img_orig_id.split('.')[-1]
        if img_ext == img_orig_id:
            img_ext = 'dat'
        img_id = 'image%d.%s' % (index, img_ext)
        index += 1
        img_file = open(os.path.join(tempdir, img_id), 'wb')
        img_file.write(img_data)
        img_file.close()
        if 'width' not in img or 'height' not in img:
            try:
                img_obj = Image.open(img_file.name)
                width, height = img_obj.size
            except Exception as e:
                logger.warn('Could not get image size for {0}: {1}'.format(
                    img['src'], e))
        if width and 'width' not in img:
            img['width'] = width
        if height and 'height' not in img:
            img['height'] = height
        img['src'] = os.path.join('images', img_id)
    return soup
def make_connection(self, host):
    self.user_pass, self.realhost = splituser(host)
    proto, proxy, p1, p2, p3, p4 = urlparse.urlparse(
        self.proxies.get('http', ''))
    if proxy and not self.local:
        return httplib.HTTP(proxy)
    else:
        return httplib.HTTP(self.realhost)
def _convert_to_text(link):
    parsed = urlparse.urlparse(link.url)
    site = parsed[1]
    # split the path on the separators /, ., _ and - (the '-' must sit at the
    # end of the character class so it is not read as a range)
    rest = ' '.join(re.split(r'[/._-]', parsed[2]))
    data = '%s %s %s user*%s topic:%s %s' % (site, rest, link.text,
                                             link.user.username,
                                             link.topic.name,
                                             link.topic.full_name)
    data = data.replace("'", "*")
    data = data.replace("%", "*")
    return data
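# Hedged sketch of the input _convert_to_text expects: a link object with
# .url, .text, .user.username, .topic.name and .topic.full_name attributes.
# The _Obj stand-in below is hypothetical, for illustration only.
def _demo_convert_to_text():
    class _Obj(object):
        def __init__(self, **kw):
            self.__dict__.update(kw)
    link = _Obj(url='http://example.com/some/path',
                text='An example link',
                user=_Obj(username='alice'),
                topic=_Obj(name='demo', full_name='demo topic'))
    # site and the tokenized path come first, then text and metadata
    print _convert_to_text(link)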
def open_sink(url, encode=True):
    parsed = urlparse.urlparse(url)
    if parsed.scheme and len(parsed.scheme) > 0 and parsed.scheme[0] == "z":
        import zcom
        return zcom.Connection(url, codec=encode)
    else:
        stream = gopen(url, "wb")
        return tarrecords.TarWriter(stream, encode=encode)
def main():
    # argument parser
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', help="The host to scrape", action='store',
                        dest='host', required=True)
    parser.add_argument('-o', help="Output file for email addresses",
                        action='store', dest='outfile')
    parser.add_argument('-n', help="Number of email addresses to collect",
                        action='store', dest='ncollect', default=20)
    parser.add_argument('-v', help="verbose output", action='store_true',
                        dest='verbose')
    parser.add_argument('-w', help="wait time between requests",
                        action='store', dest='wait')
    args = parser.parse_args()
    # signal setup with globals
    signal(SIGINT, sig_handle)
    if args.ncollect is None:
        args.ncollect = 20
    if args.wait is None:
        args.wait = 0
    # set up the output file name
    if args.outfile is not None:
        outfile = args.outfile
    else:
        outfile = "{}.emails.txt".format(urlparse.urlparse(args.host).netloc)
    emails = []
    # check verbose
    if args.verbose:
        global verbose
        verbose = True
    # scrape; if the first scrape fails, the link is invalid
    if scrape(args.host, int(args.ncollect), [], emails, int(args.wait), True):
        # save emails when finished
        with open(outfile, "a") as file:
            for email in emails:
                file.write(email)
                file.write("\n")
    else:
        print("Link read failed: {}".format(args.host))
    return 0
def _populate_hosts_and_request_paths(self):
    """
    Rackspace uses a separate host for API calls which is only provided
    after an initial authentication request. If we haven't made that
    request yet, do it here. Otherwise, just return the management host.
    """
    if not self.auth_token:
        # Initial connection used for authentication
        conn = self.conn_classes[self.secure](self.auth_host,
                                              self.port[self.secure])
        conn.request(method='GET', url='/%s' % (AUTH_API_VERSION),
                     headers={'X-Auth-User': self.user_id,
                              'X-Auth-Key': self.key})
        resp = conn.getresponse()
        if resp.status == httplib.NO_CONTENT:
            # HTTP NO CONTENT (204): auth successful
            headers = dict(resp.getheaders())
            try:
                self.server_url = headers['x-server-management-url']
                self.storage_url = headers['x-storage-url']
                self.cdn_management_url = headers['x-cdn-management-url']
                self.lb_url = self.server_url.replace("servers",
                                                      "ord.loadbalancers")
                self.auth_token = headers['x-auth-token']
            except KeyError, e:
                # Returned 204 but the header is missing information;
                # something is wrong
                raise MalformedResponseError(
                    'Malformed response',
                    body='Missing header: %s' % (str(e)),
                    driver=self.driver)
        elif resp.status == httplib.UNAUTHORIZED:
            # HTTP UNAUTHORIZED (401): auth failed
            raise InvalidCredsError()
        else:
            # Any response code != 401 or 204, something is wrong
            raise MalformedResponseError(
                'Malformed response',
                body='code: %s body:%s' % (resp.status,
                                           ''.join(resp.body.readlines())),
                driver=self.driver)
        for key in ['server_url', 'storage_url', 'cdn_management_url',
                    'lb_url']:
            scheme, server, request_path, param, query, fragment = (
                urlparse.urlparse(getattr(self, key)))
            # Set host to where we want to make further requests to
            setattr(self, '__%s' % (key), server)
            setattr(self, '__request_path_%s' % (key), request_path)
        conn.close()