Пример #1
0
    def endElement(self, tag):
        """Handle a closing tag and try to add a missing ``i18n:translate``.

        The ``tag`` argument is immediately overwritten by the tag name popped
        from ``self._history`` (together with that element's attributes and
        its collected character data).

        When the stripped data is considered translatable, is not replaced by
        TAL, the element is not inside an i18n-enabled ancestor
        (``self._i18nlevel == 0``) and the tag is not ``script``/``style``/
        ``html``, the source file is re-parsed with BeautifulSoup, matching
        elements lacking ``i18n:translate`` are looked up and ``replace`` is
        called to swap the original markup for markup carrying an empty
        ``i18n:translate`` attribute.  Failures are reported via ``self.log``.

        Finally ``self._i18nlevel`` is decremented if it is non-zero.

        NOTE(review): ``replace`` and the ``untranslated`` helpers are defined
        elsewhere; ``replace`` presumably rewrites ``self._filename`` in place
        near the given line number and returns a truthy value on success --
        confirm against its definition.
        """
        tag, attrs, data = self._history.pop()
        data = data.strip()

        if untranslated._translatable(
                data) and not untranslated._tal_replaced_content(tag, attrs):
            # Only act when we are not enclosed in an element that already
            # carries i18n markup (level 0) and the tag can hold user text.
            if (self._i18nlevel
                    == 0) and tag not in ['script', 'style', 'html']:
                severity = untranslated._severity(tag, attrs) or ''
                if severity:
                    if untranslated.IGNORE_UNTRANSLATED in attrs.keys():
                        # Ignore untranslated data. This is necessary for
                        # including literal content, that does not need to be
                        # translated.
                        pass
                    elif not untranslated.CHAMELEON_SUBST.match(data):
                        h = HTMLParser.HTMLParser()
                        # Re-parse the whole template so the element can be
                        # located and serialised by BeautifulSoup.
                        with open(self._filename, 'r') as source_file:
                            bs = BeautifulSoup.BeautifulSoup(
                                source_file, 'html.parser')
                            # Redundant: the with-block closes the file on exit.
                            source_file.close()
                        # Search criteria: every attribute of the element
                        # except 'selected'.
                        attr = {}
                        for key in attrs.keys():
                            if key not in ['selected']:
                                attr[key] = attrs.getValue(key)
                        values = bs.findAll(tag.lower(), attrs=attr)
                        if not values:
                            self.log(
                                'i18n:translate missing for this:\n'
                                '"""\n%s\n"""\nTag:<%s> Attrs:%s' %
                                (data.encode('utf8'), tag, attr), severity)
                        for v in values:
                            if not v.has_attr('i18n:translate'):
                                # Use the tag name as reported by the SAX
                                # parser rather than the lower-cased lookup name.
                                v.name = tag
                                escaper = EntitySubstitution()
                                # Second candidate: same element, but with its
                                # text entity-escaped.
                                substitute = copy(v)
                                if v.string:
                                    substitute.string = escaper.substitute_html(
                                        v.string)
                                # Try the plain element first, then the escaped
                                # copy; stop at the first successful replace().
                                for i in [v, substitute]:
                                    pattern = h.unescape(str(i))
                                    i['i18n:translate'] = ""
                                    substring = h.unescape(str(i))
                                    match = replace(
                                        self._filename, str(pattern),
                                        str(substring),
                                        self._parser.getLineNumber())
                                    if match:
                                        break
                                # `match` holds the result of the last attempt;
                                # `pattern` is the last pattern that was tried.
                                if not match:
                                    self.log(
                                        'i18n:translate missing for this:\n'
                                        '"""\n%s\n"""\nPattern: %s' %
                                        (data.encode('utf8'), str(pattern)),
                                        severity)
        # Leaving an element: step one level out of the i18n nesting, if any.
        if self._i18nlevel != 0:
            self._i18nlevel -= 1
Пример #2
0
def parse_warning_page(page_source, warning):
    """
    Parse the source of a warning HTML page and fill in name and description.

    The ``warning`` dict must already contain ``'key'``.  Its ``'name'`` and
    ``'description'`` entries are (re)written and the same dict is returned.
    Names are derived through the module's ``name`` helper.
    """
    main = BeautifulSoup(page_source, 'html.parser').find('main')

    key = warning['key']
    # Start from the <h1> header (sometimes just the message ID) ...
    warning['name'] = name(main.find('h1'), key, key)
    # ... but a blockquote paragraph sometimes holds a better description.
    warning['name'] = name(main.select_one('blockquote > p'), key, warning['name'])

    description = ''
    for para in main.select('main > p'):
        markup = str(para)
        # Compiler messages: short leading <p> elements are really headers.
        if 'Compiler Warning ' in warning['name'] and len(markup) < 200:
            warning['name'] = name(para, key, warning['name'])
            continue
        # Keep only the first real paragraph: XML otherwise becomes too large.
        description += markup
        break

    if not description:
        # Nothing usable found: repeat the header so the description is not empty.
        escaped_name = EntitySubstitution().substitute_html(warning['name'])
        description = '<p>' + escaped_name + '</p>'
    warning['description'] = description
    return warning
Пример #3
0
def xml_escape(string):
    """Formatter used when writing to file: XML-escape text and both quote kinds."""
    # BeautifulSoup's own escaping leaves quotes alone, so they are replaced
    # here after the basic XML substitution.
    escaped = EntitySubstitution.substitute_xml(string)
    for quote_char, entity in (('"', '&quot;'), ("'", '&apos;')):
        escaped = escaped.replace(quote_char, entity)
    return escaped
Пример #4
0
	def make_opening_tag(self, tag_node, self_closing=False):
		"""Build the opening-tag markup for ``tag_node``.

		:param tag_node: object exposing ``name`` and an ``attrs`` mapping;
			list-valued attributes are joined with single spaces.
		:param self_closing: when true the tag ends with ``/>`` instead of ``>``.
		:returns: the opening tag as a unicode string.
		"""
		components = [tag_node.name]
		# .items() works on both Python 2 and Python 3 dictionaries.
		for attr, value in tag_node.attrs.items():
			if isinstance(value, list):
				value = u" ".join(value)
			# substitute_html() deliberately leaves '"' untouched, so it must
			# be escaped here or it would terminate the quoted attribute value.
			escaped = EntitySubstitution.substitute_html(value).replace(u'"', u"&quot;")
			components.append(u'%s="%s"' % (attr, escaped))
		end = u"/>" if self_closing else u">"
		return u"<" + u" ".join(components) + end
Пример #5
0
def save_sent_viz_file(x, name, scores, k, args):
	"""Write an HTML file that highlights the selected words of each sample.

	Reads ``id_to_word.pkl`` from ``args.outdir`` and writes
	``sent_viz_L2X<name>.html`` into the same directory.  Each row of ``x``
	becomes one ``<p>`` element; words for which ``get_selected_words``
	returns a non-zero entry are wrapped in ``<mark><strong>``.

	:param x: iterable of id sequences (leading zeros are skipped --
		presumably padding; confirm against the data pipeline).
	:param name: suffix used in the output file name.
	:param scores: per-sample scores, indexed like ``x``.
	:param k: forwarded to ``get_selected_words``.
	:param args: object with an ``outdir`` attribute.
	"""
	escaper = EntitySubstitution()

	# NOTE(review): pickle.load must only be used on trusted files.
	with open(os.path.join(args.outdir, 'id_to_word.pkl'),'rb') as f:
		id_to_word = pickle.load(f)

	# Binary mode: everything written below is already UTF-8 encoded bytes,
	# which a text-mode file rejects on Python 3.
	with open(os.path.join(args.outdir, 'sent_viz_L2X'+name+'.html'), 'wb') as txt_file:
		txt_file.write(u"<!DOCTYPE html>\n<html>\n<body>\n".encode("utf-8"))

		for i, x_single in enumerate(x):
			x_selected = get_selected_words(x_single,
				scores[i], id_to_word, k)

			# Find the first non-zero id.  s_i is initialised so an empty row
			# does not reuse the previous row's value (or raise NameError).
			s_i = 0
			for s_i, s in enumerate(x_single):
				if s != 0:
					break

			# Slice once instead of once per word.
			selected_tail = x_selected[s_i:]
			sent_viz = list()
			for wp, wi in enumerate(x_single[s_i:]):
				word = escaper.substitute_html(id_to_word[wi])
				if selected_tail[wp] != 0:
					word = u"<mark><strong>" + word + u"</strong></mark>"
				sent_viz.append(word)

			txt_file.write((u"<p>" + u" ".join(sent_viz) + u"</p><br>\n").encode("utf-8"))
		txt_file.write(u"</body>\n</html>\n".encode("utf-8"))
Пример #6
0
	def extract_content(self):
		soup = BeautifulSoup(self.html, from_encoding="utf-8")
		content = soup.find("div", {"id" : "contenu"})
		#print dir(content)
		#print help(content)
		for x in dir(content):
			print "\t", x.title() ,type(getattr(content, x))
		print content.parent
		#print type(content)
		if content is not None:
			#todo test all function to find the best
			#print content.string
			#print content.contents
			#print content.getText()
			#print content.get_text() #Get all child strings, concatenated using the given separator.
			#print content.getText()
			#print content.Text()
			#print content.Name()
			#print content.strings # Get all child strings, concatenated using the given separator.
			
			print "title", soup.title.string
			
			with open("wiki_content.html", "wb") as myfile:
				
				myfile.write(EntitySubstitution.substitute_html(unicode(content)).encode("UTF-8"))
				#myfile.write(EntitySubstitution.substitute_html(unicode(content)).encode("UTF-8"))
				#myfile.write(unicode(content.prettify(formatter="html")).encode("UTF-8"))
				"""
				for elem in content.contents:
					print type(elem)
					myfile.write(elem.encode("UTF-8"))
				"""
			print "content",type(content)
			#self.content = content.string
			self.content = content.contents
			print type(self.content)
			print len(self.content)
Пример #7
0
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Render this tag and its contents as a Unicode string.

        :param indent_level: when not None the output is pretty-printed and
           this tag is indented by ``indent_level - 1`` spaces.
        :param eventual_encoding: passed to ``decode_contents`` and used to
           encode ``AttributeValueWithCharsetSubstitution`` attribute values.
        :param formatter: forwarded to ``format_string`` and
           ``decode_contents``.
        """
        rendered_attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # Valueless attribute: emit the bare key.
                    rendered_attrs.append(key)
                    continue
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = unicode(val)
                elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None):
                    val = val.encode(eventual_encoding)

                quoted = EntitySubstitution.quoted_attribute_value(
                    self.format_string(val, formatter))
                rendered_attrs.append(unicode(key) + '=' + quoted)

        prefix = (self.prefix + ":") if self.prefix else ''

        if self.is_empty_element:
            close, closeTag = '/', ''
        else:
            close, closeTag = '', '</%s%s>' % (prefix, self.name)

        pretty_print = indent_level is not None
        space = (' ' * (indent_level - 1)) if pretty_print else ''
        indent_contents = (indent_level + 1) if pretty_print else None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # A hidden tag contributes only its contents.
            return contents

        attribute_string = (' ' + ' '.join(rendered_attrs)) if rendered_attrs else ''
        pieces = []
        if pretty_print:
            pieces.append(space)
        pieces.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
        if pretty_print:
            pieces.append("\n")
        pieces.append(contents)
        if pretty_print:
            if contents and contents[-1] != "\n":
                pieces.append("\n")
            if closeTag:
                pieces.append(space)
        pieces.append(closeTag)
        if pretty_print and closeTag and self.next_sibling:
            pieces.append("\n")
        return ''.join(pieces)
def uppercase_and_substitute_html_entities(string):
    """Return ``string`` with HTML entities substituted and newlines removed.

    Despite the function's name, no upper-casing is performed.
    """
    escaped = EntitySubstitution.substitute_html(string)
    return escaped.replace('\n', '')
Пример #9
0
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               substitute_html_entities=False):
        """Render this tag and everything inside it as a Unicode string.

        :param indent_level: when not None the output is pretty-printed and
           this tag is indented by ``indent_level - 1`` spaces.
        :param eventual_encoding: The encoding the result is destined for.
           No encoding is performed here; the value is only used for
           attribute values containing the ``%SOUP-ENCODING%`` placeholder
           (e.g. a <META> tag naming the document's encoding).
        :param substitute_html_entities: forwarded to ``decode_contents``.
        """
        attr_parts = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # Valueless attribute: emit the bare key.
                    attr_parts.append(key)
                    continue
                if not isinstance(val, basestring):
                    val = str(val)
                wants_encoding = (self.contains_substitutions
                                  and eventual_encoding is not None
                                  and '%SOUP-ENCODING%' in val)
                if wants_encoding:
                    val = self.substituteEncoding(val, eventual_encoding)
                attr_parts.append(
                    key + '=' + EntitySubstitution.substitute_xml(val, True))

        if self.is_empty_element:
            close, closeTag = ' /', ''
        else:
            close, closeTag = '', '</%s>' % self.name

        pretty_print = indent_level is not None
        space = (' ' * (indent_level - 1)) if pretty_print else ''
        indent_contents = (indent_level + 1) if pretty_print else None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, substitute_html_entities)

        if self.hidden:
            # This is the 'document root' object.
            return contents

        attributeString = (' ' + ' '.join(attr_parts)) if attr_parts else ''
        pieces = []
        if pretty_print:
            pieces.append(space)
        pieces.append('<%s%s%s>' % (self.name, attributeString, close))
        if pretty_print:
            pieces.append("\n")
        pieces.append(contents)
        if pretty_print:
            if contents and contents[-1] != "\n":
                pieces.append("\n")
            if closeTag:
                pieces.append(space)
        pieces.append(closeTag)
        if pretty_print and closeTag and self.nextSibling:
            pieces.append("\n")
        return ''.join(pieces)
Пример #10
0
 def output_ready(self, substitute_html_entities=False):
     """Return this string wrapped in ``PREFIX``/``SUFFIX``, optionally entity-escaped."""
     body = (EntitySubstitution.substitute_html(self)
             if substitute_html_entities else self)
     return self.PREFIX + body + self.SUFFIX
Пример #11
0
    html.append(body) 

    return html

def cute_country(country_code):
    """Return an emoji for a known country code, otherwise the code itself."""
    # Canada is shown as a hockey stick rather than its flag.
    emoji_by_code = {
        'CA': "🏒",
        'CH': "🇨🇭",
        'US': "🇺🇸",
        "TW": "🐉",
    }
    if country_code in emoji_by_code:
        return emoji_by_code[country_code]
    return country_code


escaper = EntitySubstitution()
def cute_stype(x):
    """Return the HTML-escaped emoji for a known type name, else the escaped name."""
    icons = {
        'laptop': u"💻",
        'duckiebot': u"🚗"
    }
    label = icons[x] if x in icons else x
    return escaper.substitute_html(label)
    
        

if __name__ == '__main__':
    filename = sys.argv[1]
    if len(sys.argv) >= 3:
        output = sys.argv[2]
    else:
Пример #12
0
# -*- coding: utf-8 -*-
# @viticci
# A simple HTML encoder for clipboard contents
#
# The coding declaration must sit on line 1 or 2 of a file to take effect.
# print(...) with a single argument behaves the same on Python 2 and 3.

import clipboard
from bs4.dammit import EntitySubstitution

text = clipboard.get()
print(EntitySubstitution.substitute_html(text))
Пример #13
0
    def output_dom(self, tag):
        """Recursively write ``tag`` and its children to ``self.dom``.

        Attributes are rendered in the order ``tag.attrs`` yields them.
        ``<pre>`` contents are written verbatim; otherwise child tags are
        written recursively (a newline precedes the first child tag) and
        visible, non-comment strings are written stripped, with one space
        kept on a side that had whitespace when the tag has several children.
        Every tag is followed by a newline.
        """
        rendered = []
        if tag.attrs:
            for key, val in tag.attrs.iteritems():
                if val is None:
                    # Valueless attribute: emit the bare key.
                    rendered.append(key)
                    continue
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = unicode(val)
                elif isinstance(val,
                                AttributeValueWithCharsetSubstitution):
                    val = val.encode('utf-8')

                quoted = EntitySubstitution.quoted_attribute_value(
                    tag.format_string(val))
                rendered.append(unicode(key) + '=' + quoted)

        if tag.is_empty_element:
            close, close_tag = '/', ''
        else:
            close, close_tag = '', '</%s>' % tag.name
        attribute_string = (' ' + ' '.join(rendered)) if rendered else ''
        self.dom.write('<%s%s%s>' % (tag.name, attribute_string, close))

        def is_visible_string(node):
            # Text nodes count, comments do not.
            return (isinstance(node, NavigableString) and
                    not isinstance(node, Comment))

        if tag.contents:
            if tag.name == 'pre':
                self.dom.write(tag.encode_contents(encoding='utf-8'))
            elif len(tag.contents) == 1:
                only_child = tag.contents[0]
                if isinstance(only_child, Tag):
                    self.dom.write('\n')
                    self.output_dom(only_child)
                elif is_visible_string(only_child):
                    self.dom.write(only_child.output_ready(formatter='html')
                                   .strip().encode('utf-8'))
            else:
                newline_written = False
                for child in tag.contents:
                    if isinstance(child, Tag):
                        # Only the first child tag is preceded by a newline.
                        if not newline_written:
                            self.dom.write('\n')
                            newline_written = True
                        self.output_dom(child)
                    elif is_visible_string(child) and not child.isspace():
                        lead = ' ' if child[0].isspace() else ''
                        trail = ' ' if child[-1].isspace() else ''
                        text = (child.output_ready(formatter='html')
                                .strip().encode('utf-8'))
                        self.dom.write(lead + text + trail)
        self.dom.write(close_tag)
        self.dom.write('\n')
Пример #14
0
	def decode(self, indent_level=None,
			   eventual_encoding=DEFAULT_OUTPUT_ENCODING,
			   formatter="minimal"):
		"""Render this tag and everything inside it as a Unicode string.

		:param indent_level: when not None, this tag is indented by
		   ``indent_level - 1`` spaces; whether its contents are
		   pretty-printed is decided by ``_should_pretty_print``.
		:param eventual_encoding: The encoding the result is destined for.
		   No encoding of the result is performed here; the value is used
		   for attribute values that mention the document's encoding
		   (e.g. in a <META> tag) and is passed on to ``decode_contents``.
		:param formatter: a callable, or a name resolved once through
		   ``_formatter_for_name``.
		"""
		# Resolve a formatter name to a callable once, so the lookup is not
		# repeated for every string below.
		if not callable(formatter):
			formatter = self._formatter_for_name(formatter)

		attr_strings = []
		if self.attrs:
			for key, val in sorted(self.attrs.items()):
				if val is None:
					# Valueless attribute: emit the bare key.
					attr_strings.append(key)
					continue
				if isinstance(val, (list, tuple)):
					val = ' '.join(val)
				elif not isinstance(val, basestring):
					val = unicode(val)
				elif (
					isinstance(val, AttributeValueWithCharsetSubstitution)
					and eventual_encoding is not None):
					val = val.encode(eventual_encoding)

				text = self.format_string(val, formatter)
				attr_strings.append(
					unicode(key) + '='
					+ EntitySubstitution.quoted_attribute_value(text))

		prefix = (self.prefix + ":") if self.prefix else ''

		if self.is_empty_element:
			close, closeTag = '/', ''
		else:
			close, closeTag = '', '</%s%s>' % (prefix, self.name)

		pretty_print = self._should_pretty_print(indent_level)
		has_indent = indent_level is not None
		indent_space = (' ' * (indent_level - 1)) if has_indent else ''
		space = indent_space if pretty_print else ''
		indent_contents = (indent_level + 1) if pretty_print else None
		contents = self.decode_contents(
			indent_contents, eventual_encoding, formatter)

		if self.hidden:
			# This is the 'document root' object.
			return contents

		attribute_string = (' ' + ' '.join(attr_strings)) if attr_strings else ''
		pieces = []
		if has_indent:
			# Even if this particular tag is not pretty-printed,
			# we should indent up to the start of the tag.
			pieces.append(indent_space)
		pieces.append('<%s%s%s%s>' % (
				prefix, self.name, attribute_string, close))
		if pretty_print:
			pieces.append("\n")
		pieces.append(contents)
		if pretty_print and contents and contents[-1] != "\n":
			pieces.append("\n")
		if pretty_print and closeTag:
			pieces.append(space)
		pieces.append(closeTag)
		if has_indent and closeTag and self.next_sibling:
			# Even if this particular tag is not pretty-printed,
			# we're now done with the tag, and we should add a
			# newline if appropriate.
			pieces.append("\n")
		return ''.join(pieces)
Пример #15
0
def substitute_html_entities(str):
    """Substitute HTML entities, then turn curly-quote entities back into ASCII quotes."""
    text = EntitySubstitution.substitute_html(str)
    for entity, plain in (("&ldquo;", "\""), ("&rdquo;", "\""), ("&rsquo;", "'")):
        text = text.replace(entity, plain)
    return text
Пример #16
0
def envioMail():
    """Send today's active grass-pollen alert notifications by e-mail.

    Steps, as performed below:

    1. Log in to ``smtp.gmail.com:587`` using STARTTLS.
    2. Read ``coleccion_notificaciones`` from MongoDB and keep the documents
       whose ``fdesde``..``fhasta`` range (``dd/mm/YYYY``) contains today.
    3. For every distinct e-mail address build one HTML message listing, per
       subscribed municipality, the ``PrediccionOTHE`` alert levels for
       today, tomorrow and the day after.
    4. Attach ``static/style/logo.jpg`` as inline image ``cid:logo`` and send.

    ``smtplib`` and ``elimina_tildes`` must be available at module level.
    This is Python 2 code (print statements, ``unicode``).
    """
    print "Comenzamos envioMail"
    import base64
    from email.mime.multipart import MIMEMultipart
    from email.mime.text import MIMEText
    from email.mime.image import MIMEImage
    # Connect to Gmail's SMTP server
    mailServer = smtplib.SMTP('smtp.gmail.com',587)
    mailServer.ehlo()
    mailServer.starttls()
    mailServer.ehlo()
    # NOTE(review): the password is hard-coded and only base64-obfuscated;
    # it should come from configuration / environment instead.
    password = base64.b64decode("Q29uc3RhbmNpYTIx")
    mailServer.login("*****@*****.**",password)
    # Build a multipart message with text and an attached image
	# Set the sender account
    cuentaDesde = "*****@*****.**"



    from pymongo import MongoClient as Connection
    from pymongo import DESCENDING


    # NOTE(review): the connection URI (with credentials) is hard-coded and
    # MONGODB_URI is assigned twice with the same value.
    cadenaCon= 'mongodb://*****:*****@ds029635.mlab.com:29635/othesoluciones1'
    MONGODB_URI =cadenaCon
    MONGODB_URI = 'mongodb://*****:*****@ds029635.mlab.com:29635/othesoluciones1'


    db = Connection(MONGODB_URI).othesoluciones1

    import datetime
    import numpy as np
    import pandas as pd
    # Today at midnight, obtained by formatting and re-parsing the date.
    fecha = (datetime.date.today()+datetime.timedelta(days=0)).strftime('%d/%m/%Y')
    fecha = datetime.datetime.strptime(fecha,'%d/%m/%Y')
    print "Fecha de hoy-->", fecha
    # Rows of (email, municipio, fhasta) for subscriptions active today;
    # columns end up labelled 0, 1, 2.
    dfmm = pd.DataFrame()
    for doc in db.coleccion_notificaciones.find():
        if ((datetime.datetime.strptime(doc['fdesde'],'%d/%m/%Y')<= fecha) and (fecha <= datetime.datetime.strptime(doc['fhasta'],'%d/%m/%Y'))):

            df_aux=pd.DataFrame([doc['email'],doc['municipio'], doc['fhasta']])

            dfmm= dfmm.append(df_aux.T, ignore_index=True)
            

    print "****************************************************************"



    from lxml import etree
    import time
    # NOTE: `doc` is rebound here from the last Mongo document to the XML tree.
    doc=etree.parse("static/Municipios/madrid.xml")
    muni=doc.findall("municipio")

    print dfmm
    if (len(dfmm)>0):
		print "Existen notificaciones que enviar"
		# Iterate over the distinct e-mail addresses
		for j in dfmm[0].unique():
		    # Build a multipart message that will carry text and an attached image.
			# The message body depends on how many subscriptions the user has active today.
			texto=""
			mensaje = MIMEMultipart()
			mensaje['From']=cuentaDesde
			cuentaPara=j
			mensaje['To']=cuentaPara
			for i in range(0, len(dfmm)):                 
				if (dfmm.ix[i,0]==j):
					for k in range(0,len(muni)):
						# The last five characters of the XML "value" attribute
						# are compared with the subscription's municipio field.
						if (muni[k].attrib["value"][-5:]==dfmm.ix[i,1]):
							hoy = (datetime.date.today()+datetime.timedelta(days=0)).strftime('%d-%m-%Y')
							manana=(datetime.date.today()+datetime.timedelta(days=1)).strftime('%d-%m-%Y')
							pasadomanana=(datetime.date.today()+datetime.timedelta(days=2)).strftime('%d-%m-%Y')
							collection1 = db.PrediccionOTHE
							name2 =  elimina_tildes(unicode(muni[k].text[:]))
							cursor1 = collection1.find_one({"Municipio": name2})
							predHoy = cursor1["Alerta "+hoy]
							predManana= cursor1["Alerta "+manana]
							predPasadoManana=cursor1["Alerta "+pasadomanana]
							from bs4.dammit import EntitySubstitution
							# HTML-escape the municipality name for the heading.
							unsubbed = unicode(muni[k].text[:])
							esub = EntitySubstitution()
							subbed = esub.substitute_html(unsubbed)
							print "Activa hasta el: ", dfmm.ix[i,2]
							fhasta = str(dfmm.ix[i,2]).replace("/","-")
							texto = texto+str("<h3>"+subbed+":</h3><p> </p>")
							texto = texto+str("<p>El Nivel de Alerta de Gram&iacute;neas para el d&iacute;a " +hoy+" es: <b>"+str((predHoy))+"</b></p>")
							texto = texto+str("<p>El Nivel de Alerta de Gram&iacute;neas para el d&iacute;a " +manana+" es: <b>"+str((predManana))+"</b></p>")
							texto = texto+str("<p>El Nivel de Alerta de Gram&iacute;neas para el d&iacute;a " +pasadomanana+" es: <b>"+str((predPasadoManana))+"</b></p>")
							if (hoy!=fhasta):
							   texto = texto+str("<p>Recibir&aacute; esta notificaci&oacute;n hasta el: <b>"+fhasta+"</b></p>")
							else:
							    texto = texto+str("<p>Hoy d&iacute;a <b>"+fhasta+"</b> es el &uacute;ltimo en el que recibir&aacute; esta notificaci&oacute;n</p>")
							texto = texto+str("<hr>")
							
			# Set the e-mail subject
			# NOTE(review): `hoy` is only bound inside the innermost `if` above;
			# if no municipality ever matched, this line raises NameError.
			mensaje['Subject']= hoy+". Servicio de Notificaciones"
			# Text shared by every e-mail
			html_inic = """\
				<html>
					<head></head>
					<body>
					<p>Buenos d&iacute;as,</p>
					<p>Estas son las notificaciones que ha solicitado:</p><br></br>"""  
			html_fin="""\
			    <br></br>
				<p>Deseamos que pase un gran d&iacute;a.</p>
				<p>Para m&aacute;s informaci&oacute;n puede consultar nuestra web: http://gramineas-madrid.herokuapp.com/</p>
				<p>Reciba un cordial saludo por parte del equipo de Othe Soluciones</p>
				<img src="cid:logo" alt="Othe Soluciones" height="52" width="52"></img>
				</html>"""
			# Join everything into one string
			html=str(html_inic+texto+html_fin)
			
			# Assemble the message body
			mensaje.attach(MIMEText(html,'html'))
			
			# Attach the image
			# NOTE(review): this file handle is never closed.
			file = open("static/style/logo.jpg", "rb")			
			contenido = MIMEImage(file.read())
			contenido.add_header('Content-ID', '<logo>')
			mensaje.attach(contenido)
			print "Envio mail a: ", cuentaPara
			# Send the e-mail with the from and to fields.
			mailServer.sendmail(cuentaDesde, cuentaPara, mensaje.as_string())
		# Close the connection
		mailServer.close()
		print "Fin de envioMail con emails enviados"
    else:
	    # Close the connection
		mailServer.close()
		print "Fin de envioMail no habia emails que enviar"
Пример #17
0
    def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
        """Returns a Unicode representation of this tag and its contents.

        :param indent_level: when not None the output is pretty-printed and
           this tag is indented by ``indent_level - 1`` spaces.
        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.
        :param formatter: forwarded to ``decode_contents``.
        """
        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # Valueless attribute: emit the bare key.
                    decoded = key
                else:
                    if isinstance(val, (list, tuple)):
                        val = " ".join(val)
                    elif not isinstance(val, basestring):
                        val = str(val)
                    if self.contains_substitutions and eventual_encoding is not None and "%SOUP-ENCODING%" in val:
                        val = self.substitute_encoding(val, eventual_encoding)

                    decoded = str(key) + "=" + EntitySubstitution.substitute_xml(val, True)
                attrs.append(decoded)

        # The namespace prefix must be known before the closing tag is built:
        # the opening tag carries it, so the closing tag has to as well,
        # otherwise "<ns:a>" would be closed by "</a>".
        prefix = ""
        if self.prefix:
            prefix = self.prefix + ":"

        close = ""
        closeTag = ""
        if self.is_empty_element:
            close = "/"
        else:
            closeTag = "</%s%s>" % (prefix, self.name)

        pretty_print = indent_level is not None
        if pretty_print:
            space = " " * (indent_level - 1)
            indent_contents = indent_level + 1
        else:
            space = ""
            indent_contents = None
        contents = self.decode_contents(indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # This is the 'document root' object.
            s = contents
        else:
            s = []
            attribute_string = ""
            if attrs:
                attribute_string = " " + " ".join(attrs)
            if pretty_print:
                s.append(space)
            s.append("<%s%s%s%s>" % (prefix, self.name, attribute_string, close))
            if pretty_print:
                s.append("\n")
            s.append(contents)
            if pretty_print and contents and contents[-1] != "\n":
                s.append("\n")
            if pretty_print and closeTag:
                s.append(space)
            s.append(closeTag)
            if pretty_print and closeTag and self.next_sibling:
                s.append("\n")
            s = "".join(s)
        return s
Пример #18
0
 def _html_entities(self, string):
     if '&' in string:
         return string
     else:
         return EntitySubstitution.substitute_html(string)
Пример #19
0
from bs4.dammit import EntitySubstitution

esub = EntitySubstitution()


def sanitize_html(title):
    """Return ``title`` with HTML entities substituted."""
    escaped = esub.substitute_html(title)
    return escaped


def sanitize_irc(title):
    """Return ``title`` without carriage returns, line feeds and 0x01 characters."""
    forbidden = "\r\n\x01"
    kept = [char for char in title if char not in forbidden]
    return "".join(kept)


escapers = {"html": sanitize_html, "irc": sanitize_irc}


def escape(title, mode):
    """Sanitize ``title`` according to ``mode``.

    A falsy mode means "irc"; "all" applies every registered escaper in turn;
    an unknown mode returns the title unchanged.
    """
    mode = mode or "irc"

    if mode == "all":
        for sanitizer in list(escapers.values()):
            title = sanitizer(title)
        return title

    if mode in escapers:
        return escapers[mode](title)
    return title
Пример #20
0
 def output_ready(self, substitute_html_entities=False):
     """Wrap this string in ``PREFIX`` and ``SUFFIX``, entity-escaping it on request."""
     if not substitute_html_entities:
         return self.PREFIX + self + self.SUFFIX
     escaped = EntitySubstitution.substitute_html(self)
     return self.PREFIX + escaped + self.SUFFIX
Пример #21
0
    def decode(self,
               indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               substitute_html_entities=False):
        """Build the Unicode markup for this tag and its contents.

        :param indent_level: when not None the output is pretty-printed and
           this tag is indented by ``indent_level - 1`` spaces.
        :param eventual_encoding: The encoding the result will later be
           encoded into. Nothing is encoded here; the value is only used
           for attribute values containing ``%SOUP-ENCODING%`` (e.g. a
           <META> tag that mentions the document's encoding).
        :param substitute_html_entities: forwarded to ``decode_contents``.
        """
        attrs = []
        for key, val in (sorted(self.attrs.items()) if self.attrs else ()):
            if val is None:
                # Valueless attribute: emit the bare key.
                attrs.append(key)
                continue
            if not isinstance(val, basestring):
                val = str(val)
            if (self.contains_substitutions
                    and eventual_encoding is not None
                    and '%SOUP-ENCODING%' in val):
                val = self.substituteEncoding(val, eventual_encoding)
            attrs.append(key + '=' +
                         EntitySubstitution.substitute_xml(val, True))

        empty = self.is_empty_element
        close = ' /' if empty else ''
        closeTag = '' if empty else '</%s>' % self.name

        pretty_print = (indent_level is not None)
        if pretty_print:
            space, indent_contents = ' ' * (indent_level - 1), indent_level + 1
        else:
            space, indent_contents = '', None
        contents = self.decode_contents(indent_contents, eventual_encoding,
                                        substitute_html_entities)

        if self.hidden:
            # This is the 'document root' object.
            return contents

        attributeString = ''
        if attrs:
            attributeString = ' ' + ' '.join(attrs)
        opening = '<%s%s%s>' % (self.name, attributeString, close)

        if not pretty_print:
            return ''.join([opening, contents, closeTag])

        out = [space, opening, "\n", contents]
        if contents and contents[-1] != "\n":
            out.append("\n")
        if closeTag:
            out.append(space)
        out.append(closeTag)
        if closeTag and self.nextSibling:
            out.append("\n")
        return ''.join(out)
Пример #22
0
import time
import string
from twitter import *
import yweather
import codecs
import io
import shutil
import sys
from HTMLParser import HTMLParser # HTML unescaping via standard lib
from bs4.dammit import EntitySubstitution # HTML escaping, via BeautifulSoup4

# Python 2 only: reload the sys module and then set the interpreter-wide
# default string encoding to UTF-8. The reload must come first; the call on
# the next line depends on it.
reload(sys)
sys.setdefaultencoding('utf-8')

# Module-level helper instances shared by the rest of the script: one
# HTMLParser and one EntitySubstitution (per the import comments above, used
# for HTML unescaping and HTML escaping respectively).
htmlparser = HTMLParser()
esub = EntitySubstitution()

html = """
	<!DOCTYPE html>
<html >
  <head>
    <meta charset="UTF-8">
    <title>Bhagya</title>
    
    
    
    
        <style type="text/css">

              #mtabs_wrapper {
                  width: 100%%;
Пример #23
0
# -*- coding: utf-8 -*-
from __future__ import print_function
import clipboard
from bs4.dammit import EntitySubstitution

# Grab the current clipboard contents, escape them as HTML entities and
# write the escaped text to stdout.
text = clipboard.get()
escaped_text = EntitySubstitution.substitute_html(text)
print(escaped_text)

Пример #24
0
def encode_url(string):
    """Return *string* with its characters replaced by HTML entities.

    The work is delegated to ``EntitySubstitution.substitute_html`` on a
    freshly created ``EntitySubstitution`` instance.
    """
    return EntitySubstitution().substitute_html(string)
Пример #25
0
                     #tempo[cle] = []
                     tempo[cle2].append(tempoClass[cle2])
                     tempo2[cle2].append(tempoClass[cle2])
                 else:
                     tempo[cle2] = []
                     tempo2[cle2] = []
                     tempo[cle2].append(tempoClass[cle2])
                     tempo2[cle2].append(tempoClass[cle2])
     else:
         temp = unicode(' '.join(brev[cle]))
         tempo[cle] = temp
         tempo2[cle] = brev[cle]
 elif cle == 'titre':
     temp = unicode(brev[cle]).replace('[', '').replace(
         ']', '').lower().capitalize()
     formate = EntitySubstitution()
     soup = bs4.BeautifulSoup(temp)
     temp = soup.text
     tempo[cle] = temp
     #tempo2 [cle] = temp
 elif cle == 'date':
     tempo[cle] = str(brev['date'].year) + '-' + str(
         brev['date'].month) + '-' + str(brev['date'].day)
     tempo2[cle] = str(
         brev['date'].year)  # just the year in Pivottable
 elif cle == 'classification' and brev['classification'] != u'':
     tempoClass = OPS2NetUtils2.ExtractClassificationSimple2(
         brev['classification'])
     for cle in tempoClass.keys():
         if cle in tempo.keys(
         ) and tempoClass[cle] not in tempo[cle]:
Пример #26
0
def _print_row_label(title):
    """Open a left-aligned table row and print its bold label cell."""
    print("<tr style='text-align:left'>")
    print("<td style='width:20%; vertical-align:top'>")
    print("<label>")
    print(f"<b>{title}</b>")
    print("</label>")
    print("</td>")


def _print_cluster_topics(topics_all_clusters, escaper, bold_cluster=False):
    """Print one table row per cluster, listing the cluster's topics.

    Consecutive rows of *topics_all_clusters* that share the same first
    column are grouped into a single ``<tr>``: the first cell holds the
    cluster value, the second cell holds every topic (second column) followed
    by ``<br>``. Groups are separated by a full-width ``<hr>`` row.

    :param topics_all_clusters: object exposing ``iterrows()`` whose rows are
        indexable as ``row[0]`` (cluster) and ``row[1]`` (topic).
    :param escaper: object providing ``substitute_html`` for HTML escaping.
    :param bold_cluster: wrap the cluster value in ``<b>`` when true.
    """
    cell_open, cell_close = ("<b>", "</b>") if bold_cluster else ("", "")
    row_open = False
    current_cluster = None
    for _, row in topics_all_clusters.iterrows():
        cluster, topic = row[0], row[1]
        # A new group starts on the very first row and whenever the cluster
        # value changes. Each topic is printed exactly once.
        if not row_open or cluster != current_cluster:
            if row_open:
                print("</td>")
                print("</tr>")
                print("<tr><td colspan=2><hr></td></tr>")
            print("<tr>")
            print(
                f"<td style='text-align:center'>{cell_open}"
                f"{escaper.substitute_html(str(cluster))}{cell_close}</td>"
            )
            print("<td>")
            current_cluster = cluster
            row_open = True
        print(f"{escaper.substitute_html(str(topic))}<br>")
    # Only close a row if one was actually opened (empty input prints nothing).
    if row_open:
        print("</td>")
        print("</tr>")


def main():
    """CGI entry point: render the cluster lookup result page to stdout.

    The ``action_on_post`` form field selects what is rendered:

    * ``clinical_description`` / ``case_id`` -- call ``find_cluster_newcase``
      and show the case, its assigned cluster, that cluster's topics, the
      distance to every cluster and the topics of all clusters.
    * ``keyword`` -- call ``find_keyword`` and show the clusters containing
      the keyword followed by the topics of all clusters.
    * anything else -- print "Nothing to do"; no result section is rendered.

    Every dynamic value is passed through ``EntitySubstitution`` before being
    written into the page. The page header, footer logos and closing tags are
    always emitted.
    """
    escaper = EntitySubstitution()
    form = cgi.FieldStorage()
    print("Content-type: text/html\n\n")

    # NOTE(review): the last <script> tag in the head below carries
    # src="stylesheet" plus an href attribute; it looks like a copy/paste
    # slip, but the intended resource cannot be determined from this file,
    # so it is left untouched.
    print('''
                <html>
                <head>
                <title>BitsxlaMarato 2020 - La FrancoArgentina Team</title>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

                <script type="text/javascript" src="/jquery/jquery-3.3.1.min.js"></script>

                <link rel="stylesheet" type="text/css" href="/jquery/DataTables/bootstrap.min.css"/>
                <link href="/jquery/DataTables/DataTables-1.10.18/css/jquery.dataTables.css" rel="stylesheet" type="text/css" />
                <script src="/jquery/DataTables/DataTables-1.10.18/js/jquery.dataTables.js"></script>
                <link rel="stylesheet" type="text/css" href="/jquery/DataTables/dataTables.bootstrap.min.css"/>
                <script src="stylesheet" type="text/css" href="/jquery/DataTables/dataTables.js"/></script>
                
		<style>

    			.blue-button {
        			display: inline-block;
        			-webkit-box-sizing: content-box;
        			-moz-box-sizing: content-box;
        			box-sizing: content-box;
        			cursor: pointer;
        			padding: 5px 15px;
        			border: 1px solid #018dc4;
        			-webkit-border-radius: 3px;
        			border-radius: 3px;
        			font: normal 16px/normal "Times New Roman", Times, serif;
        			color: rgba(255,255,255,0.9);
        			-o-text-overflow: clip;
        			text-overflow: clip;
        			background: #787A7D;
        			-webkit-box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			text-shadow: -1px -1px 0 rgba(15,73,168,0.66) ;
        			-webkit-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-moz-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-o-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
    			}

    			.light-blue-button {
        			display: inline-block;
        			-webkit-box-sizing: content-box;
        			-moz-box-sizing: content-box;
        			box-sizing: content-box;
        			cursor: pointer;
        			padding: 2px 8px;
        			border: 1px solid #018dc4;
        			-webkit-border-radius: 3px;
        			border-radius: 3px;
        			font: normal 12px/normal "Times New Roman", Times, serif;
        			color: rgba(255,255,255,0.9);
        			-o-text-overflow: clip;
        			text-overflow: clip;
        			background: #a6cfe0;
        			-webkit-box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			text-shadow: -1px -1px 0 rgba(15,73,168,0.66) ;
        			-webkit-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-moz-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-o-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
    			}

    			.text-input {
        			display: inline-block;
        			-webkit-box-sizing: content-box;
        			-moz-box-sizing: content-box;
        			box-sizing: content-box;
        			padding: 4px 10px;
        			border: 1px solid #b7b7b7;
					margin-bottom: 30px;
        			-webkit-border-radius: 3px;
        			border-radius: 3px;
        			font: normal 16px/normal "Times New Roman", Times, serif;
        			color: rgba(0,142,198,1);
        			-o-text-overflow: clip;
        			text-overflow: clip;
        			letter-spacing: 1px;
        			word-spacing: 2px;
        			background: rgba(234,234,234,1);
        			-webkit-box-shadow: 2px 2px 2px 0 rgba(0,0,0,0.2) inset;
        			box-shadow: 2px 2px 2px 0 rgba(0,0,0,0.2) inset;
        			text-shadow: 1px 1px 0 rgba(255,255,255,0.66) ;
        			-webkit-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
        			-moz-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
        			-o-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
        			transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
    			}

			#title-1 {
				font-family: Verdana, Geneva, sans-serif;
				font-size: 24px;
				letter-spacing: 0.4px;
				word-spacing: 0px;
				color: #000000;
				font-weight: 700;
				text-decoration: none;
				font-style: normal;
				font-variant: normal;
				text-transform: none;
			}

			#title-2 {
				font-family: Verdana, Geneva, sans-serif;
				font-size: 20px;
				letter-spacing: 0.4px;
				word-spacing: 0px;
				color: #000000;
				font-weight: 700;
				text-decoration: none;
				font-style: normal;
				font-variant: normal;
				text-transform: none;
				vertical-align: middle;
				text-align: center;
			}

			#title-3 {
				font-family: Verdana, Geneva, sans-serif;
				font-size: 12px;
				letter-spacing: 0.4px;
				word-spacing: 0px;
				color: #000000;
				font-weight: 700;
				text-decoration: none;
				font-style: normal;
				font-variant: normal;
				text-transform: none;
			}

			#all-content {
				margin: auto;
			}

			.center {
				text-align: center;
			}

			.row {
				min-height: 100px;
				position: relative;
				text-align: center;
			}

			.column_center {
  				display: inline-block;
  				padding: 20px;
  				border:1px solid red;
			}

			label {
  				float: center;
  				margin: 10 30px;
			}


		</style>

                </head>


    ''')
    print('''<body>
            <div id="all-content">
            <div class="row">
            <div id="title-2">BitsxlaMarato 2020 - La FrancoArgentina Team</div>
            <br>
            <br>
            ''')

    # Decide which result view to render. `view` stays None for an unknown or
    # missing action so that nothing below touches unbound result variables.
    action = form.getfirst('action_on_post', None)
    view = None
    if action == "clinical_description":
        (case_id, case_desc, doc_simil_clust, top_cluster,
         topics_all_clusters, topics_top_cluster) = find_cluster_newcase(
             case_id=None, case_desc=form.getfirst('clinical_desc'))
        view = "case"
    elif action == "case_id":
        (case_id, case_desc, doc_simil_clust, top_cluster,
         topics_all_clusters, topics_top_cluster) = find_cluster_newcase(
             case_id=form.getfirst('clinical_id'), case_desc=None)
        view = "case"
    elif action == "keyword":
        clusters, topics_all_clusters = find_keyword(
            keyword=form.getfirst('keyword'))
        view = "keyword"
    else:
        print(''' Nothing to do ''')

    if view == "case":
        print("<table border=0>")

        _print_row_label("Case ID:")
        print("<td style='width:85%'>")
        print(f"<b>{escaper.substitute_html(str(case_id))}</b>")
        print("</td>")
        print("</tr>")

        _print_row_label("Case description:")
        print("<td style='width:85%'>")
        print(f"{escaper.substitute_html(case_desc)}")
        print("</td>")
        print("</tr>")

        _print_row_label("Assigned to cluster:")
        print("<td style='width:85%'>")
        print(f"{escaper.substitute_html(str(top_cluster))}")
        print("</td>")
        print("</tr>")

        _print_row_label("Topics in the assigned cluster:")
        print("<td style='width:85%'>")
        for _, row in topics_top_cluster.iterrows():
            print(f"{escaper.substitute_html(str(row[1]))}<br>")
        print("</td>")
        print("</tr>")

        _print_row_label("Mean pairwise distance to each cluster:")
        print("<td>")
        print("<table border='0'>")
        print("<tr>")
        print("<td style='width:45%; text-align:center'><b>Cluster</b></td>")
        print("<td style='width:55%; text-align:center'><b>Distance</b></td>")
        print("</tr>")
        for _, row in doc_simil_clust.iterrows():
            print("<tr>")
            print(
                f"<td style='width:45%; text-align:center'><b>{escaper.substitute_html(str(int(row[0])))}</b></td>"
            )
            print(
                f"<td style='width:55%; text-align:right'>{escaper.substitute_html(str(row[1]))}</td>"
            )
            print("</tr>")
        print("</table>")
        print("</td>")
        print("</tr>")
        print("</table>")
        print("<hr>")

        print("<table border=0>")
        _print_row_label("Topics in all clusters:")
        print("<td>")
        print("<table border='0'>")
        print("<tr>")
        print("<th style='text-align:center'><b>Cluster</b></th>")
        print("<th style='text-align:center'><b>Topics</b></th>")
        print("</tr>")
        _print_cluster_topics(topics_all_clusters, escaper)
        print("</table>")
        print("</td>")
        print("</tr>")
        print("</table>")
        print("<br>")
    elif view == "keyword":
        if len(clusters) > 0:
            print(
                "<center><h2>Keyword found in the following clusters:</h2></center>"
            )
            print("<center>")
            print(
                "<table border=0><tr><th class='text-center'>Cluster</th><th class='text-center'>Topics</th></tr>"
            )
            for row in clusters:
                # Escaped like every other model-derived value on this page.
                # NOTE(review): assumes row['topics'] is plain text, not
                # pre-rendered HTML -- confirm against find_keyword.
                cluster_html = escaper.substitute_html(str(row['cluster']))
                topics_html = escaper.substitute_html(str(row['topics']))
                print(
                    f"<tr><td align='center'><b>{cluster_html}</b></td><td>{topics_html}</td></tr>"
                )
            print("</table>")
            print(
                "<center><h3>Look for these clusters below in order to find all the words in each of them.</h3></center>"
            )
            print("</center>")
        else:
            print("<center><h2>Keyword not found in any cluster</h2></center>")
        print("<br><hr>")
        print("<center>")
        print("<b>Topics in all clusters:</b>")
        print("<table border='0'>")
        print("<tr>")
        print("<th style='text-align:center'><b>Cluster</b></th>")
        print("<th style='text-align:center'><b>Topics</b></th>")
        print("</tr>")
        _print_cluster_topics(topics_all_clusters, escaper, bold_cluster=True)
        print("</table>")
        print("</center>")
        print("<br>")

    print('''
        <hr>
        <center>
                <table border=0 height="100px" width="60%">
                        <tr>
                                <td> <img src="/images/FIB-web.png" height="60%"> </td>
                                <td> <img src="/images/hackers-upc-web.png" height="60%"> </td>
                                <td> <img src="/images/bsc-web.png" height="60%"> </td>
                                <td> <img src="/images/plan-tl-web.png" height="60%"> </td>
                        </tr>
                </table>
        </center>
    ''')

    print("</body>")
    print("</html>")
Пример #27
0
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Render this tag, its attributes and its contents as one string.

        :param indent_level: when not None, pretty-print the output; the
           opening and closing tags are indented by ``indent_level - 1``
           spaces and the contents are rendered one level deeper.
        :param eventual_encoding: the encoding the result will later be
           encoded into. No encoding of the result happens here; the value
           is forwarded to ``decode_contents`` and applied to attribute
           values that are ``AttributeValueWithCharsetSubstitution``
           instances (e.g. a <META> tag naming the document's encoding).
        :param formatter: passed to ``format_string`` for every attribute
           value and on to ``decode_contents``.
        """
        rendered_attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # Valueless attribute: emit the bare key.
                    rendered_attrs.append(key)
                    continue
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = str(val)
                elif (eventual_encoding is not None
                      and isinstance(
                          val, AttributeValueWithCharsetSubstitution)):
                    val = val.encode(eventual_encoding)

                quoted = EntitySubstitution.quoted_attribute_value(
                    self.format_string(val, formatter))
                rendered_attrs.append(str(key) + '=' + quoted)

        # An empty element closes itself and therefore has no closing tag.
        if self.is_empty_element:
            close, closeTag = '/', ''
        else:
            close, closeTag = '', '</%s>' % self.name

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        pretty_print = indent_level is not None
        if pretty_print:
            space = ' ' * (indent_level - 1)
            indent_contents = indent_level + 1
        else:
            space = ''
            indent_contents = None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # This is the 'document root' object: only its contents count.
            return contents

        attribute_string = ''
        if rendered_attrs:
            attribute_string = ' ' + ' '.join(rendered_attrs)

        pieces = []
        if pretty_print:
            pieces.append(space)
        pieces.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
        if pretty_print:
            pieces.append("\n")
        pieces.append(contents)
        if pretty_print:
            if contents and contents[-1] != "\n":
                pieces.append("\n")
            if closeTag:
                pieces.append(space)
        pieces.append(closeTag)
        if pretty_print and closeTag and self.next_sibling:
            pieces.append("\n")
        return ''.join(pieces)
Пример #28
0
def custom_formatter(string):
    """Entity-substitute *string*, additionally escaping both quote kinds.

    After ``EntitySubstitution.substitute_html`` has run, every ``"`` becomes
    ``&quot;`` and every ``'`` becomes ``&apos;``.
    """
    escaped = EntitySubstitution.substitute_html(string)
    escaped = escaped.replace('"', '&quot;')
    escaped = escaped.replace("'", '&apos;')
    return escaped
Пример #29
0
    def decode(self,
               indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Build the string form of this tag together with its contents.

        :param indent_level: None for compact output; otherwise the output
           is pretty-printed, with the tag indented by ``indent_level - 1``
           spaces and its contents rendered at ``indent_level + 1``.
        :param eventual_encoding: the encoding the caller will encode the
           result into. This method does not encode the result itself; the
           value is handed to ``decode_contents`` and used for attribute
           values of type ``AttributeValueWithCharsetSubstitution`` (such as
           a <META> tag that mentions the document's encoding).
        :param formatter: forwarded to ``format_string`` for attribute
           values and to ``decode_contents``.
        """
        attr_strings = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # No value at all: the attribute is just its name.
                    attr_strings.append(key)
                    continue

                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                elif not isinstance(val, str):
                    val = str(val)
                elif (eventual_encoding is not None and isinstance(
                        val, AttributeValueWithCharsetSubstitution)):
                    val = val.encode(eventual_encoding)

                formatted = self.format_string(val, formatter)
                attr_strings.append(
                    str(key) + '=' +
                    EntitySubstitution.quoted_attribute_value(formatted))

        # Self-closing elements get a trailing slash and no closing tag.
        if self.is_empty_element:
            close = '/'
            closeTag = ''
        else:
            close = ''
            closeTag = '</%s>' % self.name

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        pretty_print = indent_level is not None
        space = ' ' * (indent_level - 1) if pretty_print else ''
        indent_contents = indent_level + 1 if pretty_print else None
        contents = self.decode_contents(indent_contents, eventual_encoding,
                                        formatter)

        if self.hidden:
            # This is the 'document root' object; it has no markup of its own.
            return contents

        attribute_string = ''
        if attr_strings:
            attribute_string = ' ' + ' '.join(attr_strings)
        open_tag = '<%s%s%s%s>' % (prefix, self.name, attribute_string, close)

        if not pretty_print:
            return ''.join([open_tag, contents, closeTag])

        parts = [space, open_tag, "\n", contents]
        if contents and contents[-1] != "\n":
            parts.append("\n")
        if closeTag:
            parts.append(space)
        parts.append(closeTag)
        if closeTag and self.next_sibling:
            parts.append("\n")
        return ''.join(parts)
def substitute_html_entities(str):
    """Entity-substitute *str* but keep curly quotes as plain ASCII quotes.

    The text is first run through ``EntitySubstitution.substitute_html``;
    afterwards ``&ldquo;`` and ``&rdquo;`` are turned into ``"`` and
    ``&rsquo;`` into ``'``.
    """
    result = EntitySubstitution.substitute_html(str)
    # Applied in this order: left double, right double, right single quote.
    for entity, plain in (("&ldquo;", "\""), ("&rdquo;", "\""),
                          ("&rsquo;", "'")):
        result = result.replace(entity, plain)
    return result