Exemplo n.º 1
0
def parse_warning_page(page_source, warning):
    """
    Parse the source of a warning HTML page and fill in name and description.

    The ``<main>`` element of the page is inspected.  ``warning['name']`` is
    derived via ``name()`` from the ``<h1>`` header and then from the first
    ``blockquote > p`` element; ``warning['description']`` is the markup of
    one ``<p>`` child of ``<main>``, or the escaped name wrapped in ``<p>``
    when no paragraph was used.  The updated ``warning`` dict is returned.
    """
    main = BeautifulSoup(page_source, 'html.parser').find('main')

    key = warning['key']
    # header first (sometimes it is only the message ID) ...
    warning['name'] = name(main.find('h1'), key, key)
    # ... then the blockquote, which sometimes holds a better description
    warning['name'] = name(main.select_one('blockquote > p'), key, warning['name'])

    description = ''
    for p in main.select('main > p'):
        markup = str(p)
        # Compiler messages: short leading paragraphs are treated as header
        # text.  The name is re-checked on every pass because it may have
        # just been replaced.
        if 'Compiler Warning ' in warning['name'] and len(markup) < 200:
            warning['name'] = name(p, key, warning['name'])
            continue
        # only one paragraph is kept: the XML otherwise becomes too large
        description = markup
        break

    if not description:
        # repeat the header so the description is never empty
        escaped_name = EntitySubstitution().substitute_html(warning['name'])
        description = '<p>' + escaped_name + '</p>'

    warning['description'] = description
    return warning
Exemplo n.º 2
0
    def endElement(self, tag):
        """Handle the end of an element and deal with untranslated text.

        Pops the (tag, attrs, data) entry recorded for this element from
        ``self._history``.  When the text is translatable, is not TAL-replaced
        content, sits outside any i18n scope (``self._i18nlevel == 0``) and
        has a severity, the source file is re-parsed with BeautifulSoup and,
        for every matching element lacking ``i18n:translate``, ``replace()``
        is called with the element's markup before and after adding an empty
        ``i18n:translate`` attribute.  Cases that cannot be matched are
        reported through ``self.log``.  Finally ``self._i18nlevel`` is
        decremented when it is non-zero.

        NOTE(review): ``HTMLParser.HTMLParser().unescape`` is the Python 2
        spelling -- confirm the targeted interpreter before porting.
        """
        # The incoming ``tag`` argument is overwritten by the recorded entry.
        tag, attrs, data = self._history.pop()
        data = data.strip()

        if untranslated._translatable(
                data) and not untranslated._tal_replaced_content(tag, attrs):
            # not enclosed
            if (self._i18nlevel
                    == 0) and tag not in ['script', 'style', 'html']:
                severity = untranslated._severity(tag, attrs) or ''
                if severity:
                    if untranslated.IGNORE_UNTRANSLATED in attrs.keys():
                        # Ignore untranslated data. This is necessary for
                        # including literal content, that does not need to be
                        # translated.
                        pass
                    elif not untranslated.CHAMELEON_SUBST.match(data):
                        h = HTMLParser.HTMLParser()
                        # The whole source file is re-parsed for every
                        # element that reaches this branch.
                        with open(self._filename, 'r') as source_file:
                            bs = BeautifulSoup.BeautifulSoup(
                                source_file, 'html.parser')
                            # Redundant: the ``with`` block closes the file.
                            source_file.close()
                        # Search attributes: everything except 'selected'.
                        attr = {}
                        for key in attrs.keys():
                            if key not in ['selected']:
                                attr[key] = attrs.getValue(key)
                        values = bs.findAll(tag.lower(), attrs=attr)
                        if not values:
                            # No element in the re-parsed file matches.
                            self.log(
                                'i18n:translate missing for this:\n'
                                '"""\n%s\n"""\nTag:<%s> Attrs:%s' %
                                (data.encode('utf8'), tag, attr), severity)
                        for v in values:
                            if not v.has_attr('i18n:translate'):
                                # Restore the recorded tag name (the search
                                # above used the lower-cased name).
                                v.name = tag
                                escaper = EntitySubstitution()
                                # Second candidate: same element with its
                                # text entity-escaped.
                                substitute = copy(v)
                                if v.string:
                                    substitute.string = escaper.substitute_html(
                                        v.string)
                                # Try the plain element first, then the
                                # escaped copy; stop at the first success.
                                for i in [v, substitute]:
                                    # Markup before the attribute is added ...
                                    pattern = h.unescape(str(i))
                                    i['i18n:translate'] = ""
                                    # ... and after.
                                    substring = h.unescape(str(i))
                                    # presumably rewrites the file near the
                                    # given line and returns a truthy value
                                    # on success -- verify against replace()
                                    match = replace(
                                        self._filename, str(pattern),
                                        str(substring),
                                        self._parser.getLineNumber())
                                    if match:
                                        break
                                if not match:
                                    # Neither candidate could be replaced;
                                    # ``pattern`` is the last one tried.
                                    self.log(
                                        'i18n:translate missing for this:\n'
                                        '"""\n%s\n"""\nPattern: %s' %
                                        (data.encode('utf8'), str(pattern)),
                                        severity)
        # Leaving an element: step one level out of the i18n scope, if any.
        if self._i18nlevel != 0:
            self._i18nlevel -= 1
Exemplo n.º 3
0
def save_sent_viz_file(x, name, scores, k, args):
	"""Write an HTML page that highlights the selected words of each sample.

	For every sample in ``x`` the selection returned by
	``get_selected_words(sample, scores[i], id_to_word, k)`` is rendered as
	one ``<p>`` paragraph: ids are mapped to words through ``id_to_word``,
	HTML-escaped, and wrapped in ``<mark><strong>`` when the corresponding
	selection entry is non-zero.  Leading zero ids are skipped (presumably
	padding -- confirm against the data pipeline).

	Args:
		x: iterable of id sequences, one per sample.
		name: suffix used in the output file name ``sent_viz_L2X<name>.html``.
		scores: per-sample scores, indexed like ``x``.
		k: forwarded to ``get_selected_words``.
		args: object with an ``outdir`` attribute; ``id_to_word.pkl`` is read
			from there and the HTML file is written there.
	"""
	escaper = EntitySubstitution()

	# NOTE(security): pickle.load can execute arbitrary code; only point
	# args.outdir at trusted data.
	with open(os.path.join(args.outdir, 'id_to_word.pkl'), 'rb') as f:
		id_to_word = pickle.load(f)

	out_path = os.path.join(args.outdir, 'sent_viz_L2X' + name + '.html')
	# Binary mode: every write below passes UTF-8 encoded bytes, which a
	# text-mode file rejects on Python 3.
	with open(out_path, 'wb') as txt_file:
		txt_file.write(u"<!DOCTYPE html>\n<html>\n<body>\n".encode("utf-8"))

		for i, x_single in enumerate(x):
			x_selected = get_selected_words(x_single,
				scores[i], id_to_word, k)

			# Index of the first non-zero id.  Initialised so an empty sample
			# does not reuse the previous sample's value (or raise NameError).
			s_i = 0
			for s_i, s in enumerate(x_single):
				if s != 0:
					break

			# Slice once instead of once per word.
			selected_tail = x_selected[s_i:]
			sent_viz = list()
			for wp, wi in enumerate(x_single[s_i:]):
				word = escaper.substitute_html(id_to_word[wi])
				if selected_tail[wp] != 0:
					word = u"<mark><strong>" + word + u"</strong></mark>"
				sent_viz.append(word)

			txt_file.write((u"<p>" + u" ".join(sent_viz) + u"</p><br>\n").encode("utf-8"))
		txt_file.write(u"</body>\n</html>\n".encode("utf-8"))
Exemplo n.º 4
0
                     #tempo[cle] = []
                     tempo[cle2].append(tempoClass[cle2])
                     tempo2[cle2].append(tempoClass[cle2])
                 else:
                     tempo[cle2] = []
                     tempo2[cle2] = []
                     tempo[cle2].append(tempoClass[cle2])
                     tempo2[cle2].append(tempoClass[cle2])
     else:
         temp = unicode(' '.join(brev[cle]))
         tempo[cle] = temp
         tempo2[cle] = brev[cle]
 elif cle == 'titre':
     temp = unicode(brev[cle]).replace('[', '').replace(
         ']', '').lower().capitalize()
     formate = EntitySubstitution()
     soup = bs4.BeautifulSoup(temp)
     temp = soup.text
     tempo[cle] = temp
     #tempo2 [cle] = temp
 elif cle == 'date':
     tempo[cle] = str(brev['date'].year) + '-' + str(
         brev['date'].month) + '-' + str(brev['date'].day)
     tempo2[cle] = str(
         brev['date'].year)  # just the year in Pivottable
 elif cle == 'classification' and brev['classification'] != u'':
     tempoClass = OPS2NetUtils2.ExtractClassificationSimple2(
         brev['classification'])
     for cle in tempoClass.keys():
         if cle in tempo.keys(
         ) and tempoClass[cle] not in tempo[cle]:
Exemplo n.º 5
0
from bs4.dammit import EntitySubstitution

# One shared escaper instance, reused by every sanitize_html() call.
esub = EntitySubstitution()


def sanitize_html(title):
    """Return *title* passed through ``EntitySubstitution.substitute_html``."""
    escaped = esub.substitute_html(title)
    return escaped


def sanitize_irc(title):
    """Return *title* with carriage returns, newlines and ``\\x01`` removed."""
    forbidden = "\r\n\x01"
    return "".join(filter(lambda ch: ch not in forbidden, title))


# Sanitizers by mode name; escape() applies them in this order for "all".
escapers = {
    "html": sanitize_html,
    "irc": sanitize_irc,
}


def escape(title, mode):
    """Sanitize *title* according to *mode*.

    A falsy *mode* means ``"irc"``.  ``"all"`` runs every registered
    sanitizer in turn; a mode with no registered sanitizer returns the
    title unchanged.
    """
    selected = mode or "irc"

    if selected == "all":
        for sanitizer in escapers.values():
            title = sanitizer(title)
        return title

    sanitizer = escapers.get(selected)
    if sanitizer is None:
        return title
    return sanitizer(title)
Exemplo n.º 6
0
    html.append(body) 

    return html

def cute_country(country_code):
    """Return the emoji for *country_code*, or the code itself if unmapped."""
    emoji_by_code = {
        # 'CA' uses the hockey emoji instead of the flag "🇨🇦"
        'CA': "🏒",
        'CH': "🇨🇭",
        'US': "🇺🇸",
        "TW": "🐉",
    }
    try:
        return emoji_by_code[country_code]
    except KeyError:
        return country_code


# Shared escaper instance used by cute_stype().
escaper = EntitySubstitution()


def cute_stype(x):
    """Return the emoji for type *x* (or *x* itself), passed through
    ``escaper.substitute_html``."""
    icons = {
        'laptop': u"💻",
        'duckiebot': u"🚗"
    }
    label = icons[x] if x in icons else x
    return escaper.substitute_html(label)
    
        

if __name__ == '__main__':
    filename = sys.argv[1]
    if len(sys.argv) >= 3:
        output = sys.argv[2]
    else:
Exemplo n.º 7
0
def _print_label_cell(label_html, value_cell_open="<td style='width:85%'>"):
    """Open a result row: print the left-hand label cell, then open the value cell."""
    print("<tr style='text-align:left'>")
    print("<td style='width:20%; vertical-align:top'>")
    print("<label>")
    print(label_html)
    print("</label>")
    print("</td>")
    print(value_cell_open)


def _print_topics_table(topics_all_clusters, escaper, bold_cluster=False):
    """Print the Cluster/Topics table with one row per run of equal cluster values."""
    print("<table border='0'>")
    print("<tr>")
    print("<th style='text-align:center'><b>Cluster</b></th>")
    print("<th style='text-align:center'><b>Topics</b></th>")
    print("</tr>")

    cluster_cell = "<b>{}</b>" if bold_cluster else "{}"
    row_open = False
    current = None
    for _, row in topics_all_clusters.iterrows():
        cluster = row[0]
        # A new table row starts on the first record and whenever the
        # cluster value changes; the topic value plays no part in this.
        if not row_open or cluster != current:
            if row_open:
                print("</td>")
                print("</tr>")
                print("<tr><td colspan=2><hr></td></tr>")
            print("<tr>")
            escaped_cluster = escaper.substitute_html(str(cluster))
            print(
                f"<td style='text-align:center'>{cluster_cell.format(escaped_cluster)}</td>"
            )
            print("<td>")
            current = cluster
            row_open = True
        # Each topic is printed exactly once.
        print(f"{escaper.substitute_html(str(row[1]))}<br>")
    if row_open:
        print("</td>")
        print("</tr>")
    print("</table>")


def main():
    """CGI entry point: render the clustering result page on stdout.

    Reads the submitted form and dispatches on its ``action_on_post`` field:

    * ``clinical_description`` / ``case_id`` -- calls ``find_cluster_newcase``
      and prints the case, its assigned cluster, that cluster's topics, the
      per-cluster distances and the topics of all clusters.
    * ``keyword`` -- calls ``find_keyword`` and prints the clusters returned
      for the keyword followed by the topics of all clusters.
    * anything else -- prints "Nothing to do".

    The page header and the logo footer are printed in every case.  All
    dynamic values are HTML-escaped before being written.
    """
    escaper = EntitySubstitution()
    form = cgi.FieldStorage()
    print("Content-type: text/html\n\n")

    print('''
                <html>
                <head>
                <title>BitsxlaMarato 2020 - La FrancoArgentina Team</title>
                <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

                <script type="text/javascript" src="/jquery/jquery-3.3.1.min.js"></script>

                <link rel="stylesheet" type="text/css" href="/jquery/DataTables/bootstrap.min.css"/>
                <link href="/jquery/DataTables/DataTables-1.10.18/css/jquery.dataTables.css" rel="stylesheet" type="text/css" />
                <script src="/jquery/DataTables/DataTables-1.10.18/js/jquery.dataTables.js"></script>
                <link rel="stylesheet" type="text/css" href="/jquery/DataTables/dataTables.bootstrap.min.css"/>
                <script src="stylesheet" type="text/css" href="/jquery/DataTables/dataTables.js"/></script>
                
		<style>

    			.blue-button {
        			display: inline-block;
        			-webkit-box-sizing: content-box;
        			-moz-box-sizing: content-box;
        			box-sizing: content-box;
        			cursor: pointer;
        			padding: 5px 15px;
        			border: 1px solid #018dc4;
        			-webkit-border-radius: 3px;
        			border-radius: 3px;
        			font: normal 16px/normal "Times New Roman", Times, serif;
        			color: rgba(255,255,255,0.9);
        			-o-text-overflow: clip;
        			text-overflow: clip;
        			background: #787A7D;
        			-webkit-box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			text-shadow: -1px -1px 0 rgba(15,73,168,0.66) ;
        			-webkit-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-moz-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-o-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
    			}

    			.light-blue-button {
        			display: inline-block;
        			-webkit-box-sizing: content-box;
        			-moz-box-sizing: content-box;
        			box-sizing: content-box;
        			cursor: pointer;
        			padding: 2px 8px;
        			border: 1px solid #018dc4;
        			-webkit-border-radius: 3px;
        			border-radius: 3px;
        			font: normal 12px/normal "Times New Roman", Times, serif;
        			color: rgba(255,255,255,0.9);
        			-o-text-overflow: clip;
        			text-overflow: clip;
        			background: #a6cfe0;
        			-webkit-box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ;
        			text-shadow: -1px -1px 0 rgba(15,73,168,0.66) ;
        			-webkit-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-moz-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			-o-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
        			transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1);
    			}

    			.text-input {
        			display: inline-block;
        			-webkit-box-sizing: content-box;
        			-moz-box-sizing: content-box;
        			box-sizing: content-box;
        			padding: 4px 10px;
        			border: 1px solid #b7b7b7;
					margin-bottom: 30px;
        			-webkit-border-radius: 3px;
        			border-radius: 3px;
        			font: normal 16px/normal "Times New Roman", Times, serif;
        			color: rgba(0,142,198,1);
        			-o-text-overflow: clip;
        			text-overflow: clip;
        			letter-spacing: 1px;
        			word-spacing: 2px;
        			background: rgba(234,234,234,1);
        			-webkit-box-shadow: 2px 2px 2px 0 rgba(0,0,0,0.2) inset;
        			box-shadow: 2px 2px 2px 0 rgba(0,0,0,0.2) inset;
        			text-shadow: 1px 1px 0 rgba(255,255,255,0.66) ;
        			-webkit-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
        			-moz-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
        			-o-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
        			transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1);
    			}

			#title-1 {
				font-family: Verdana, Geneva, sans-serif;
				font-size: 24px;
				letter-spacing: 0.4px;
				word-spacing: 0px;
				color: #000000;
				font-weight: 700;
				text-decoration: none;
				font-style: normal;
				font-variant: normal;
				text-transform: none;
			}

			#title-2 {
				font-family: Verdana, Geneva, sans-serif;
				font-size: 20px;
				letter-spacing: 0.4px;
				word-spacing: 0px;
				color: #000000;
				font-weight: 700;
				text-decoration: none;
				font-style: normal;
				font-variant: normal;
				text-transform: none;
				vertical-align: middle;
				text-align: center;
			}

			#title-3 {
				font-family: Verdana, Geneva, sans-serif;
				font-size: 12px;
				letter-spacing: 0.4px;
				word-spacing: 0px;
				color: #000000;
				font-weight: 700;
				text-decoration: none;
				font-style: normal;
				font-variant: normal;
				text-transform: none;
			}

			#all-content {
				margin: auto;
			}

			.center {
				text-align: center;
			}

			.row {
				min-height: 100px;
				position: relative;
				text-align: center;
			}

			.column_center {
  				display: inline-block;
  				padding: 20px;
  				border:1px solid red;
			}

			label {
  				float: center;
  				margin: 10 30px;
			}


		</style>

                </head>


    ''')
    print('''<body>
            <div id="all-content">
            <div class="row">
            <div id="title-2">BitsxlaMarato 2020 - La FrancoArgentina Team</div>
            <br>
            <br>
            ''')

    # Which result view to render: 1 = single case, 2 = keyword search,
    # None = nothing.  Initialised up front so an unknown or missing action
    # no longer raises UnboundLocalError below.
    view = None
    action = form.getfirst('action_on_post', None)
    if action == "clinical_description":
        case_id, case_desc, doc_simil_clust, top_cluster, topics_all_clusters, topics_top_cluster = find_cluster_newcase(
            case_id=None, case_desc=form.getfirst('clinical_desc'))
        view = 1
    elif action == "case_id":
        case_id, case_desc, doc_simil_clust, top_cluster, topics_all_clusters, topics_top_cluster = find_cluster_newcase(
            case_id=form.getfirst('clinical_id'), case_desc=None)
        view = 1
    elif action == "keyword":
        clusters, topics_all_clusters = find_keyword(
            keyword=form.getfirst('keyword'))
        view = 2
    else:
        print(''' Nothing to do ''')

    if view == 1:
        print("<table border=0>")

        _print_label_cell("<b>Case ID:</b>")
        # case_id may come straight from the request, so it is escaped like
        # every other dynamic value.
        print(f"<b>{escaper.substitute_html(str(case_id))}</b>")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Case description:</b>")
        print(f"{escaper.substitute_html(case_desc)}")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Assigned to cluster:</b>")
        print(f"{escaper.substitute_html(str(top_cluster))}")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Topics in the assigned cluster:</b>")
        for _, row in topics_top_cluster.iterrows():
            print(f"{escaper.substitute_html(str(row[1]))}<br>")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Mean pairwise distance to each cluster:</b>",
                          "<td>")
        print("<table border='0'>")
        print("<tr>")
        print("<td style='width:45%; text-align:center'><b>Cluster</b></td>")
        print("<td style='width:55%; text-align:center'><b>Distance</b></td>")
        print("</tr>")
        for _, row in doc_simil_clust.iterrows():
            print("<tr>")
            print(
                f"<td style='width:45%; text-align:center'><b>{escaper.substitute_html(str(int(row[0])))}</b></td>"
            )
            print(
                f"<td style='width:55%; text-align:right'>{escaper.substitute_html(str(row[1]))}</td>"
            )
            print("</tr>")
        print("</table>")
        print("</td>")
        print("</table>")
        print("<hr>")

        print("<table border=0>")
        _print_label_cell("<b>Topics in all clusters:</b>", "<td>")
        _print_topics_table(topics_all_clusters, escaper)
        print("</td>")
        print("</table>")
        print("<br>")
    elif view == 2:
        if len(clusters) > 0:
            print(
                "<center><h2>Keyword found in the following clusters:</h2></center>"
            )
            print("<center>")
            print(
                "<table border=0><tr><th class='text-center'>Cluster</th><th class='text-center'>Topics</th></tr>"
            )
            for row in clusters:
                cluster_html = escaper.substitute_html(str(row['cluster']))
                topics_html = escaper.substitute_html(str(row['topics']))
                print(
                    f"<tr><td align='center'><b>{cluster_html}</b></td><td>{topics_html}</td></tr>"
                )
            print("</table>")
            print(
                "<center><h3>Look for these clusters below in order to find all the words in each of them.</h3></center>"
            )
            print("</center>")
        else:
            print("<center><h2>Keyword not found in any cluster</h2></center>")
        print("<br><hr>")
        # The stray </label>, </td>, <td> and </table> tags that used to
        # surround this table had no matching opening tags and were dropped.
        print("<center>")
        print("<b>Topics in all clusters:</b>")
        _print_topics_table(topics_all_clusters, escaper, bold_cluster=True)
        print("</center>")
        print("<br>")

    print('''
        <hr>
        <center>
                <table border=0 height="100px" width="60%">
                        <tr>
                                <td> <img src="/images/FIB-web.png" height="60%"> </td>
                                <td> <img src="/images/hackers-upc-web.png" height="60%"> </td>
                                <td> <img src="/images/bsc-web.png" height="60%"> </td>
                                <td> <img src="/images/plan-tl-web.png" height="60%"> </td>
                        </tr>
                </table>
        </center>
    ''')

    print("</html>")
Exemplo n.º 8
0
def encode_url(string):
    """Return *string* passed through ``EntitySubstitution.substitute_html``.

    Despite the name this performs HTML entity substitution, not URL
    percent-encoding.
    """
    return EntitySubstitution().substitute_html(string)