def get_context(url, matchtext, before, after):
    """Return a word-trimmed text window around *matchtext* on the page at *url*.

    Fetches the (cached) page, converts its HTML to text, then slices out up
    to *before* characters preceding the match and *after* characters past
    its start, trimming the slice to whole words.
    Returns an empty string when *matchtext* does not occur in the page text.
    """
    html = get_cached_url(url).read()
    textsegments = html_to_text(html)
    i = textsegments.find(matchtext)
    if i == -1:
        # Fix: find() returns -1 on a miss; the old code then sliced from
        # max(0, -1 - before), producing an unrelated chunk of the page.
        return ""
    bigtext = textsegments[max(0, i - before):min(i + after, len(textsegments))]
    return trim_to_words(bigtext)
def get_raw_disputes(url):
    """Unfiltered and unranked. Return all disputes we find.

    Fetches at most 200 kB of *url* (2-second timeout), converts the HTML
    to plain text, and matches it against known claims.  Best-effort: any
    failure (network error, bad HTML, ...) yields an empty list.
    """
    try:
        htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = html_to_text(htmlcontent)
        return match_with_claims(text)
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # still propagate; deliberate best-effort fallback.
        return []
def get_dispute_context(claimtext):
    """Return display context for *claimtext* as a dict.

    Looks up the first ClaimContext row whose claimtext matches; returns an
    all-empty context dict when no row exists or the lookup fails for any
    other reason (the indexing raises IndexError on an empty queryset).
    """
    try:
        contextobj = ClaimContext.objects.filter(claimtext=claimtext)[0]
        text = html_to_text(contextobj.sentence).strip()
        return {'url': contextobj.url,
                'text': text,
                'prefix': contextobj.prefix,
                'date': contextobj.date,
                'badvotes': contextobj.badvotes,
                'goodvotes': contextobj.goodvotes}
    except Exception:
        # Narrowed from a bare ``except``; the empty context is the
        # deliberate "no match found" fallback.
        return {'url': '', 'text': '', 'prefix': '', 'date': '',
                'badvotes': 0, 'goodvotes': 0}
def get_raw_disputes(url):
    """Unfiltered and unranked. Return all disputes we find.

    Fetches at most 200 kB of *url* (2-second timeout), converts the HTML
    to plain text, and matches it against known claims.  Best-effort: any
    failure (network error, bad HTML, ...) yields an empty list.
    """
    try:
        htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = html_to_text(htmlcontent)
        return match_with_claims(text)
    except Exception:
        # Narrowed from a bare ``except`` so SystemExit/KeyboardInterrupt
        # still propagate; deliberate best-effort fallback.
        return []
def get_raw_disputes(url):
    """Fetch *url* and return up to four unique, positively scored disputes.

    Each dispute is a sequence: index 0 is its score; indexes 3 and 4 are
    the two claim sides, used here to de-duplicate (a side may appear in at
    most one returned dispute).  Returns [] on any error (best-effort).
    """
    try:
        htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = h.html_to_text(htmlcontent)
        matches = r.get_sorted_claims(text)
        # Keep only positively scored disputes, at most four of them.
        disputes = [dispute for dispute in matches if dispute[0] > 0][:4]
        unique = []
        used = set()  # claim sides already emitted (was the odd ``set({})``)
        for dispute in disputes:
            if dispute[3] not in used and dispute[4] not in used:
                used.add(dispute[3])
                used.add(dispute[4])
                unique.append(dispute)
        return unique
    except Exception:
        # Narrowed from a bare ``except``; deliberate best-effort fallback.
        return []
def get_raw_disputes(url):
    """Fetch *url* and return up to four unique, positively scored disputes.

    Each dispute is a sequence: index 0 is its score; indexes 3 and 4 are
    the two claim sides, used here to de-duplicate (a side may appear in at
    most one returned dispute).  Returns [] on any error (best-effort).
    """
    try:
        htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = h.html_to_text(htmlcontent)
        matches = r.get_sorted_claims(text)
        # Keep only positively scored disputes, at most four of them.
        disputes = [dispute for dispute in matches if dispute[0] > 0][:4]
        unique = []
        used = set()  # claim sides already emitted (was the odd ``set({})``)
        for dispute in disputes:
            if dispute[3] not in used and dispute[4] not in used:
                used.add(dispute[3])
                used.add(dispute[4])
                unique.append(dispute)
        return unique
    except Exception:
        # Narrowed from a bare ``except``; deliberate best-effort fallback.
        return []
def get_page_disputes(url, pages):
    """Render the disputes found on a pre-fetched page as HTML boxes.

    *pages* maps url -> raw HTML.  Finds up to four unique, positively
    scored disputes (sides at indexes 3 and 4 de-duplicate), renders each
    through the "disputed_box" template (index 1 is the dispute text), and
    joins them with spaces.  Returns "" on any error (best-effort).
    """
    try:
        htmlcontent = pages[url]
        text = h.html_to_text(htmlcontent)
        matches = r.get_sorted_claims(text)
        # Keep only positively scored disputes, at most four of them.
        disputes = [dispute for dispute in matches if dispute[0] > 0][:4]
        unique = []
        used = set()  # claim sides already emitted (was the odd ``set({})``)
        for dispute in disputes:
            if dispute[3] not in used and dispute[4] not in used:
                used.add(dispute[3])
                used.add(dispute[4])
                unique.append(dispute)
        disputes = [template("disputed_box", dispute=d[1]) for d in unique]
        return " ".join(disputes)
    except Exception:
        # Narrowed from a bare ``except``; deliberate best-effort fallback.
        return ""
def get_page_disputes(url, pages=None):
    """Render the disputes found on a page as HTML boxes.

    When *pages* (a url -> raw HTML mapping) is given, the page is read
    from it; otherwise *url* is fetched directly (200 kB cap, 2-second
    timeout).  Finds up to four unique, positively scored disputes (sides
    at indexes 3 and 4 de-duplicate), renders each through the
    "disputed_box" template, and joins them with spaces.
    Returns "" on any error (best-effort).
    """
    try:
        if pages:
            htmlcontent = pages[url]
        else:
            htmlcontent = urllib2.urlopen(url, None, 2).read(200000)
        text = h.html_to_text(htmlcontent)
        matches = r.get_sorted_claims(text)
        # Keep only positively scored disputes, at most four of them.
        disputes = [dispute for dispute in matches if dispute[0] > 0][:4]
        unique = []
        used = set()  # claim sides already emitted (was the odd ``set({})``)
        for dispute in disputes:
            if dispute[3] not in used and dispute[4] not in used:
                used.add(dispute[3])
                used.add(dispute[4])
                unique.append(dispute)
        disputes = [template("disputed_box", dispute=d[1]) for d in unique]
        return " ".join(disputes)
    except Exception:
        # Narrowed from a bare ``except``; deliberate best-effort fallback.
        return ""
def get_prefixes(results, claim):
    """Collect the non-empty claim prefixes found in the result abstracts."""
    prefixes = []
    for result in results:
        abstract = html_to_text(result["abstract"])
        prefix = prefix_for_claim(abstract, claim)
        if prefix:
            prefixes.append(prefix)
    return prefixes
def get_abstracts(results, claim):
    """Return the plain-text abstract of every result.

    *claim* is unused; the parameter keeps the signature parallel to
    get_prefixes.
    """
    abstracts = []
    for result in results:
        abstracts.append(html_to_text(result["abstract"]))
    return abstracts
def get_context(url, matchtext, before, after):
    """Return a word-trimmed text window around *matchtext* on the page at *url*.

    Fetches the (cached) page, converts its HTML to text, then slices out up
    to *before* characters preceding the match and *after* characters past
    its start, trimming the slice to whole words.
    Returns an empty string when *matchtext* does not occur in the page text.
    """
    html = get_cached_url(url).read()
    textsegments = html_to_text(html)
    i = textsegments.find(matchtext)
    if i == -1:
        # Fix: find() returns -1 on a miss; the old code then sliced from
        # max(0, -1 - before), producing an unrelated chunk of the page.
        return ""
    bigtext = textsegments[max(0, i - before):min(i + after, len(textsegments))]
    return trim_to_words(bigtext)
def claims_from_html(content):
    """Decode HTML entities, strip markup, and extract claims from the text."""
    plaintext = ht.html_to_text(nt.convert_entities(content))
    claims = claims_from_body(plaintext)
    return claims
def bodys_from_tab_file(f):
    """Yield the plain-text body (fourth tab-separated column) of each line.

    Lines with fewer than four columns are skipped.  Each body has its HTML
    entities decoded and its markup stripped before being yielded.
    """
    for rawline in f:
        columns = rawline.strip().split("\t")
        if len(columns) <= 3:
            continue  # no body column on this line
        yield ht.html_to_text(nt.convert_entities(columns[3]))
def get_prefixes(results, claim):
    """Collect the non-empty claim prefixes found in the result abstracts."""
    found = []
    for entry in results:
        plaintext = html_to_text(entry["abstract"])
        candidate = prefix_for_claim(plaintext, claim)
        if candidate:
            found.append(candidate)
    return found
def get_abstracts(results, claim):
    """Return the plain-text abstract of every result.

    *claim* is unused; the parameter keeps the signature parallel to
    get_prefixes.
    """
    collected = []
    for entry in results:
        collected.append(html_to_text(entry["abstract"]))
    return collected