Example No. 1
def get_context(url, matchtext, before, after):
    # Fetch the cached page, strip the HTML, and return a window of
    # `before`/`after` characters around the first occurrence of `matchtext`.
    html = get_cached_url(url).read()
    textsegments = html_to_text(html)
    i = textsegments.find(matchtext)
    if i == -1:
        # find() returns -1 on a miss; without this guard the slice below
        # would silently return text from the start of the page
        return ""
    bigtext = textsegments[max(0, i - before):min(i + after, len(textsegments))]
    return trim_to_words(bigtext)
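A hypothetical call, assuming get_cached_url, html_to_text, and trim_to_words are in scope from the surrounding codebase; the URL and window sizes are invented for illustration:

context = get_context("http://example.com/article", "climate change",
                      before=200, after=200)
print(context)  # at most ~400 characters around the match, trimmed to word boundaries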
Example No. 2
def get_raw_disputes(url):
    """Unfiltered and unranked. Return all disputes we find."""
    try:
        # 2-second timeout, read at most 200 kB of the page
        htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = html_to_text(htmlcontent)
        return match_with_claims(text)
    except Exception:
        # Network or parse failure: report no disputes rather than crash
        return []
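urllib2 exists only on Python 2; under Python 3 the same fetch goes through urllib.request. A minimal port sketch, assuming html_to_text and match_with_claims behave as above:

from urllib.request import urlopen

def get_raw_disputes_py3(url):
    """Unfiltered and unranked. Return all disputes we find."""
    try:
        # Same limits as above: 2-second timeout, at most 200 kB
        raw = urlopen(url, timeout=2).read(200000)
        text = html_to_text(raw.decode("utf-8", errors="replace"))
        return match_with_claims(text)
    except Exception:
        return []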
Example No. 3
def get_dispute_context(claimtext):
    try:
        # First stored context for this claim; filter(...)[0] raises
        # IndexError when no ClaimContext row matches
        contextobj = ClaimContext.objects.filter(claimtext=claimtext)[0]
        text = html_to_text(contextobj.sentence).strip()
        return {'url': contextobj.url, 'text': text,
                'prefix': contextobj.prefix, 'date': contextobj.date,
                'badvotes': contextobj.badvotes, 'goodvotes': contextobj.goodvotes}
    except Exception:
        # No match (or bad HTML): fall back to an empty context record
        return {'url': '', 'text': '', 'prefix': '', 'date': '',
                'badvotes': 0, 'goodvotes': 0}
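Under Django's ORM, filter(...)[0] raises IndexError on an empty queryset, which is one of the cases the except branch absorbs. A narrower variant of the same lookup, assuming Django 1.6+ and the ClaimContext fields used above:

def get_dispute_context_v2(claimtext):
    contextobj = ClaimContext.objects.filter(claimtext=claimtext).first()
    if contextobj is None:
        # No stored context for this claim
        return {'url': '', 'text': '', 'prefix': '', 'date': '',
                'badvotes': 0, 'goodvotes': 0}
    return {'url': contextobj.url,
            'text': html_to_text(contextobj.sentence).strip(),
            'prefix': contextobj.prefix, 'date': contextobj.date,
            'badvotes': contextobj.badvotes, 'goodvotes': contextobj.goodvotes}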
Example No. 4
def get_raw_disputes(url):
    try:
        # 2-second timeout, read at most 200 kB of the page
        htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = h.html_to_text(htmlcontent)
        matches = r.get_sorted_claims(text)
        # Keep at most the four best positive-scoring matches
        disputes = [dispute for dispute in matches if dispute[0] > 0][:4]
        # Drop any dispute whose claim fields (indices 3 and 4) were
        # already taken by an earlier, higher-ranked dispute
        unique = []
        used = set()
        for dispute in disputes:
            if dispute[3] not in used and dispute[4] not in used:
                used.add(dispute[3])
                used.add(dispute[4])
                unique.append(dispute)
        return unique
    except Exception:
        return []
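The filter-then-dedup loop above keeps a dispute only if neither of its two claim fields has already been taken by a higher-ranked dispute. The same pattern in isolation, on made-up 5-tuples (the (score, html, _, claim_a, claim_b) layout is an assumption read off the indices used above):

matches = [(3, "<b>A vs B</b>", None, "A", "B"),
           (2, "<b>A vs C</b>", None, "A", "C"),  # dropped: "A" already used
           (1, "<b>D vs E</b>", None, "D", "E")]
unique, used = [], set()
for m in matches:
    if m[3] not in used and m[4] not in used:
        used.update((m[3], m[4]))
        unique.append(m)
print([m[3:] for m in unique])  # [('A', 'B'), ('D', 'E')]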
Example No. 5
def get_page_disputes(url, pages):
    try:
        # `pages` is a prefetched mapping of url -> raw HTML
        htmlcontent = pages[url]
        text = h.html_to_text(htmlcontent)
        matches = r.get_sorted_claims(text)
        disputes = [dispute for dispute in matches if dispute[0] > 0][:4]
        unique = []
        used = set()
        for dispute in disputes:
            if dispute[3] not in used and dispute[4] not in used:
                used.add(dispute[3])
                used.add(dispute[4])
                unique.append(dispute)
        # Render each surviving dispute through the "disputed_box" template
        disputes = [template("disputed_box", dispute=d[1]) for d in unique]
        return " ".join(disputes)
    except Exception:
        return ""
Example No. 6
def get_page_disputes(url, pages=None):
    try:
        if pages:
            # Prefetched mapping of url -> raw HTML
            htmlcontent = pages[url]
        else:
            # Fall back to a live fetch: 2-second timeout, 200 kB cap
            htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = h.html_to_text(htmlcontent)
        matches = r.get_sorted_claims(text)
        disputes = [dispute for dispute in matches if dispute[0] > 0][:4]
        unique = []
        used = set()
        for dispute in disputes:
            if dispute[3] not in used and dispute[4] not in used:
                used.add(dispute[3])
                used.add(dispute[4])
                unique.append(dispute)
        disputes = [template("disputed_box", dispute=d[1]) for d in unique]
        return " ".join(disputes)
    except Exception:
        return ""
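Hypothetical calls, assuming the module-level h, r, template, and urllib2 names are in scope; the URL and prefetched HTML are invented:

pages = {"http://example.com/story": "<html><body>some claim text</body></html>"}
boxes = get_page_disputes("http://example.com/story", pages)  # uses the prefetched copy
boxes = get_page_disputes("http://example.com/story")         # falls back to a live fetch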
Example No. 7
def get_prefixes(results, claim):
    abstracts = [html_to_text(result["abstract"]) for result in results]
    prefixes = [prefix_for_claim(abstract, claim) for abstract in abstracts]
    return [prefix for prefix in prefixes if prefix]
Example No. 8
def bodys_from_tab_file(f):
    for line in f:
        row = line.strip().split("\t")
        if len(row) > 3:
            # The fourth tab-separated column holds the HTML body
            yield ht.html_to_text(nt.convert_entities(row[3]))
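Because f only needs to yield lines, a quick check can run the generator over an in-memory file; the row contents here are invented, and ht/nt are assumed to be the helper modules imported alongside this function:

import io

fake = io.StringIO("id1\t2020-01-01\ttitle\t<p>Body &amp; more</p>\n")
for body in bodys_from_tab_file(fake):
    print(body)  # the entity-decoded, tag-stripped fourth column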
Example No. 9
def claims_from_html(content):
    text = ht.html_to_text(nt.convert_entities(content))
    return claims_from_body(text)
Example No. 10
def get_abstracts(results, claim):
    return [html_to_text(result["abstract"]) for result in results]
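Every example above depends on html_to_text, whose implementation isn't shown here. A minimal stand-in built on Python 3's standard library might look like the sketch below; the real helper is likely more careful about scripts, styles, and whitespace:

from html.parser import HTMLParser

class _TextExtractor(HTMLParser):
    """Collects the text content of every element it sees."""
    def __init__(self):
        super().__init__()
        self.chunks = []

    def handle_data(self, data):
        self.chunks.append(data)

def html_to_text(html):
    # Feed the markup through the parser and join the text nodes
    parser = _TextExtractor()
    parser.feed(html)
    return " ".join(c.strip() for c in parser.chunks if c.strip())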