def fill_file(result):
    """Fetch the page at ``result`` and return its content as text.

    The page is fetched with ``get_page``, passed through ``html_to_text``
    and, when the outcome decodes as UTF-8, rendered with ``html2text``.
    ``html2text.wrapwrite`` prints to stdout, so stdout is temporarily
    redirected into an in-memory buffer to capture the rendered text.

    Args:
        result: URL of the page to fetch; also handed to ``html2text`` as
            its second argument.

    Returns:
        ``"a"`` when ``get_page`` returns an empty string, the output of
        ``html_to_text`` unchanged when it cannot be decoded as UTF-8, and
        otherwise the text printed by ``html2text.wrapwrite``.
    """
    url = result
    page = get_page(url)
    if page == "":
        # NOTE(review): "a" looks like a sentinel for "nothing fetched";
        # kept as-is because callers may compare against it.
        return "a"

    # Strip the tags first, then try to obtain a unicode string.
    stripped = html_to_text(page.read())
    try:
        ustr = stripped.decode('utf-8')
    except Exception:
        # Best effort: hand back the undecoded text rather than failing.
        # ``Exception`` (not a bare except) lets Ctrl-C/SystemExit through.
        return stripped

    captured = StringIO()
    saved_stdout = sys.stdout
    try:
        sys.stdout = captured
        html2text.wrapwrite(html2text.html2text(ustr, url))
        return captured.getvalue()
    finally:
        # Always restore stdout and release the buffer, even on error.
        sys.stdout = saved_stdout
        captured.close()
def prep(dataset):
    """Retrieve information from CMS ReqMgr data-service.

    The dataset's first path component is used as the ``dsn`` query
    argument of the request-management page.  The page is first requested
    through the SSO URL using the working PEM key and the user certificate;
    when the SSO reply yields a form action, the form is submitted, any
    ``setCookie(...)`` rows found in the answer are copied into the
    opener's cookie jar, and the page is requested again with that opener.
    The last data read is rendered with ``html2text`` and printed via
    ``wrapwrite``.

    Args:
        dataset: dataset path; ``dataset.split('/')[1]`` is used as ``dsn``.

    Raises:
        IndexError: if ``dataset`` contains no ``/``.
        KeyError: if the ``HOME`` environment variable is not set.
    """
    dsn = dataset.split('/')[1]
    purl = 'http://cms.cern.ch/iCMS/jsp/mcprod/admin/requestmanagement.jsp'
    sso = 'https://cms.cern.ch/test/env.cgi?url='
    # Encode the query once; it is used for both the SSO and the final URL.
    query = urllib.urlencode({'dsn': dsn, 'campid': 'any'})
    url = sso + purl + '?' + query
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    data = ''
    with working_pem(PEMMGR.pem) as key:
        data = get_data_sso(url, key, cert).read()
        params_dict, action = parse_sso_output(data)
        if action:
            params = urllib.urlencode(params_dict)
            opener = create_https_opener(key, cert)
            data = opener.open(action, params).read()
            for row in data.split('\n'):
                if 'setCookie' not in row:
                    continue
                # Take the text after the last "(", drop quotes and keep
                # the first two comma-separated fields as name and value.
                ctup = row.split('(')[-1].replace('"', '').replace(
                    "'", '').split(',')[:2]
                if len(ctup) < 2:
                    # Malformed row: no name/value pair to store.
                    continue
                for hdl in opener.handlers:
                    # Matched by repr so urllib2 need not be in scope here.
                    if 'urllib2.HTTPCookieProcessor' not in repr(hdl):
                        continue
                    jar = hdl.cookiejar
                    # Reuse domain/path of the first cookie already present.
                    template = next(iter(jar), None)
                    if template is not None:
                        jar.set_cookie(cookielib.Cookie(
                            port=None, port_specified=False,
                            domain=template.domain, domain_specified=False,
                            domain_initial_dot=False,
                            path=template.path, path_specified=False,
                            secure=None, expires=None, discard=True,
                            comment=None, comment_url=None, rest=None,
                            version=0, name=ctup[0], value=ctup[1]))
                    print(jar)
            data = opener.open(purl + '?' + query).read()
    wrapwrite(html2text(data, ''))
def prep(dataset):
    """Retrieve information from CMS ReqMgr data-service.

    The dataset's first path component is used as the ``dsn`` query
    argument of the request-management page.  The page is first requested
    through the SSO URL using the working PEM key and the user certificate;
    when the SSO reply yields a form action, the form is submitted, any
    ``setCookie(...)`` rows found in the answer are copied into the
    opener's cookie jar, and the page is requested again with that opener.
    The last data read is rendered with ``html2text`` and printed via
    ``wrapwrite``.

    Args:
        dataset: dataset path; ``dataset.split('/')[1]`` is used as ``dsn``.

    Raises:
        IndexError: if ``dataset`` contains no ``/``.
        KeyError: if the ``HOME`` environment variable is not set.
    """
    dsn = dataset.split('/')[1]
    purl = 'http://cms.cern.ch/iCMS/jsp/mcprod/admin/requestmanagement.jsp'
    sso = 'https://cms.cern.ch/test/env.cgi?url='
    # Encode the query once; it is used for both the SSO and the final URL.
    query = urllib.urlencode({'dsn': dsn, 'campid': 'any'})
    url = sso + purl + '?' + query
    cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
    data = ''
    with working_pem(PEMMGR.pem) as key:
        data = get_data_sso(url, key, cert).read()
        params_dict, action = parse_sso_output(data)
        if action:
            params = urllib.urlencode(params_dict)
            opener = create_https_opener(key, cert)
            data = opener.open(action, params).read()
            for row in data.split('\n'):
                if 'setCookie' not in row:
                    continue
                # Take the text after the last "(", drop quotes and keep
                # the first two comma-separated fields as name and value.
                ctup = row.split('(')[-1].replace('"', '').replace(
                    "'", '').split(',')[:2]
                if len(ctup) < 2:
                    # Malformed row: no name/value pair to store.
                    continue
                for hdl in opener.handlers:
                    # Matched by repr so urllib2 need not be in scope here.
                    if 'urllib2.HTTPCookieProcessor' not in repr(hdl):
                        continue
                    jar = hdl.cookiejar
                    # Reuse domain/path of the first cookie already present.
                    template = next(iter(jar), None)
                    if template is not None:
                        jar.set_cookie(cookielib.Cookie(
                            port=None, port_specified=False,
                            domain=template.domain, domain_specified=False,
                            domain_initial_dot=False,
                            path=template.path, path_specified=False,
                            secure=None, expires=None, discard=True,
                            comment=None, comment_url=None, rest=None,
                            version=0, name=ctup[0], value=ctup[1]))
                    print(jar)
            data = opener.open(purl + '?' + query).read()
    wrapwrite(html2text(data, ''))
def _read_plain_url(url):
    """Open ``url`` with urllib and return ``(body, headers)``, closing it."""
    handle = urllib.urlopen(url)
    try:
        return handle.read(), handle.headers
    finally:
        handle.close()


def read(url, output=None, debug=0):
    """Read a local file or a URL and either display or save its content.

    * If ``url`` is an existing file, its content is sent to the pager
      (printed when paging fails) and nothing else happens.
    * ``cmsweb.cern.ch`` URLs are fetched with ``get_data`` and treated as
      already-decoded data (no html-to-text conversion).
    * ``mcdb.cern.ch`` URLs and non-``cern.ch`` URLs are fetched with plain
      ``urllib``.
    * Any other ``cern.ch`` URL is fetched through SSO with the working PEM
      key and the user certificate from ``$HOME/.globus/usercert.pem``.

    Fetched content is written unmodified to ``output`` when given.
    Otherwise it is converted with ``html2text`` (when an encoding is
    known) and shown through the pager if ``CMSSH_PAGER`` is set, or
    printed directly.

    Args:
        url: path of a local file or URL to read.
        output: optional file name; when set the raw content is saved
            there instead of being displayed.
        debug: passed through to ``get_data_sso``.

    Returns:
        None.
    """
    if os.path.isfile(url):
        with open(url, 'r') as stream:
            context = stream.read()
        try:
            pydoc.pager(context)
        except Exception:
            print(context)
        return

    if 'cmsweb.cern.ch' in url:
        html = get_data(url, decoder=None)
        encoding = None
    elif 'mcdb.cern.ch' in url:
        body, headers = _read_plain_url(url)
        # NOTE(review): literal kept exactly as found; it may have been
        # '&nbsp_place_holder;' before the source was extracted -- confirm.
        html = body.replace(' _place_holder;', '')
        encoding = enc(headers, html)[0]
    elif 'cern.ch' not in url:
        html, headers = _read_plain_url(url)
        encoding = enc(headers, html)[0]
    else:
        # Only the SSO path needs the user certificate (and thus $HOME).
        cert = os.path.join(os.environ['HOME'], '.globus/usercert.pem')
        with working_pem(PEMMGR.pem) as key:
            data = get_data_sso(url, key, cert, debug)
            html = data.read()
            encoding = enc(data.headers, html)[0]
    if encoding == 'us-ascii':
        encoding = 'utf-8'

    if not html:
        return
    if int(os.environ.get('HTTPDEBUG', 0)):
        print_info('read data')
        print(html)

    if output:
        # The raw content is saved, so no decoding/conversion is required.
        with open(output, 'w') as stream:
            stream.write(html)
        return

    pager = os.environ.get('CMSSH_PAGER', None)
    if encoding:
        res = html2text(html.decode(encoding), '')
        try:
            if pager:
                pydoc.pager(res.encode('utf-8'))
            else:
                wrapwrite(res)
        except Exception:
            # Paging/printing failed: fall back to plain wrapwrite output.
            wrapwrite(res)
    else:
        try:
            if pager:
                pydoc.pager(html)
            else:
                print(html)
        except Exception:
            print(html)
#!/usr/bin/env python
# Convert HTML to text with html2text.
#
# Usage: <script> [FILE [ENCODING]]
#   FILE      HTML file to convert; standard input is read when omitted.
#   ENCODING  encoding of FILE; when omitted it is detected with chardet if
#             that package is importable, otherwise UTF-8 is assumed.
import sys
# Presumably added so html2text can be imported from the parent directory.
sys.path.append('..')
import html2text

if __name__ == "__main__":
    # process input
    args = sys.argv[1:]
    if len(args) > 2:
        # Reject extra arguments up front, before touching any file.
        sys.exit('Too many arguments')
    if args:
        encoding = args[1] if len(args) == 2 else None
        with open(args[0], 'rb') as stream:
            data = stream.read()
        if encoding is None:
            try:
                from chardet import detect
            except ImportError:
                detect = lambda x: {'encoding': 'utf-8'}
            # chardet reports None when it cannot decide; assume UTF-8 then.
            encoding = detect(data)['encoding'] or 'utf-8'
        data = data.decode(encoding)
    else:
        data = sys.stdin.read()
    html2text.wrapwrite(html2text.html2text(data, ''))