def get_paragraphs_JT(str_text, mode, file_name=''):
    """Extract non-boilerplate paragraphs using jusText."""
    if mode == "_english":
        stop = justext.get_stoplist("English")
    elif mode == 'lang_detect':
        lang = get_langid(str_text)
        if lang == "Chinese":
            stop = set()
        else:
            stop = justext.get_stoplist(lang)
    # mode in which we use the 'true' language provided by the doc_lg.json file
    elif mode == 'lang_specified' and file_name != '':
        with open(DOC_LG_PATH, mode='r', encoding='utf-8', errors='ignore') as lang_code_file:
            json_data = json.load(lang_code_file)  # load our language codes
            lang = json_data[file_name]  # look up the document's language
            if lang == "Chinese":
                stop = set()
            else:
                stop = justext.get_stoplist(lang)
    else:
        stop = frozenset()
    if len(stop) == 0:
        any_lang_stop_words = get_all_stop_words()
        paragraphs = justext.justext(str_text, any_lang_stop_words)
    else:
        paragraphs = justext.justext(str_text, stop)
    list_paragraphs = [x.text for x in paragraphs if not x.is_boilerplate]
    return list_paragraphs
def apply_justext_boilerplate_stripper(self, r, stoplist): index_key = "index_{}_{}_{}".format(r["country"], r["website"], r["feed_name"]) index_key = "{}.csv".format(self.escape_filename(index_key)) w = self.escape_filename(r["website"]) feed_name = self.escape_filename(r["feed_name"]) original_html_path = os.path.join(self.raw_dir, w, feed_name, r["original_html_file"]) xml_dir = os.path.join(self.proc_dir, w, feed_name) try: os.makedirs(xml_dir) except IOError: pass processed_xml_path = os.path.join(xml_dir, r["original_html_file"].replace(".html", ".xml")) try: with open(original_html_path, "r", encoding="utf-8") as h: text = h.read() except FileNotFoundError: text = None self.index_df[index_key].loc[r.name, "downloaded"] = False self.index_df[index_key].loc[r.name, "processed"] = False self.index_df[index_key].loc[r.name, "justext_comment"] = np.nan if text: try: paragraphs = justext.justext(text, justext.get_stoplist("English")) except ValueError: # e.g. if unable to get stoplist in pyinstaller compiled version paragraphs = justext.justext(text, stoplist=stoplist) to_keep = [] bp_count = 0 for paragraph in paragraphs: if not paragraph.is_boilerplate: to_keep.append(paragraph) else: bp_count += 1 if to_keep: root = etree.Element("text") tree = etree.ElementTree(root) for paragraph in to_keep: p_elem = etree.Element("p") p_elem.text = paragraph.text root.append(p_elem) xml_str = etree.tounicode(tree) try: tree.write(processed_xml_path, pretty_print=True, encoding='utf-8', xml_declaration=True) except IOError as e: print("WARNING: Could not write XML file:", e) self.index_df[index_key].loc[r.name, "processed"] = False else: self.index_df[index_key].loc[r.name, "processed"] = True else: print("WARNING: No non-boilerplate code found for", original_html_path) self.index_df[index_key].loc[r.name, "justext_comment"] = "{}/{}".format(len(to_keep), bp_count) self.index_df[index_key].loc[r.name, "extraction_method"] = "jusText"
def getTextFromWeb(self): num_results = 10 search_list = ["bbc", "Little Red Riding Hood"] sites = [] text = [] results = [] while len(search_list)!=0 and len(results) < num_results: search = search_list.pop() results = results + google.google(search,nltk.word_tokenize) for d in results: sites.append(d) if len(sites) == num_results: break for url in sites: print url try: page = urllib2.urlopen(url).read() except urllib2.HTTPError, e: print "Search failed: %s" % e continue paragraphs = justext.justext(page, justext.get_stoplist('English')) if len(text) < 50: for paragraph in paragraphs: if paragraph['class'] == 'good' and len(text) < 50: sentences = self.segment_sentences(paragraph['text'].encode('utf8')) for s in sentences: if not text.__contains__(s): text.append(s)
def get_document_text(input_url_response): DOCUMENT_LENGTH = 0 paragraphs = justext.justext(input_url_response.content, justext.get_stoplist("English")) for paragraph in paragraphs: DOCUMENT_LENGTH += len(paragraph.text) return DOCUMENT_LENGTH
def try_justext(tree, url, target_language): '''Second safety net: try with the generic algorithm justext''' result_body = etree.Element('body') justtextstring = html.tostring(tree, pretty_print=False, encoding='utf-8') # determine language if target_language is not None and target_language in JUSTEXT_LANGUAGES: langsetting = JUSTEXT_LANGUAGES[target_language] justext_stoplist = justext.get_stoplist(langsetting) else: #justext_stoplist = justext.get_stoplist(JUSTEXT_DEFAULT) justext_stoplist = JT_STOPLIST # extract try: paragraphs = justext.justext(justtextstring, justext_stoplist, 50, 200, 0.1, 0.2, 0.2, 200, True) except ValueError as err: # not an XML element: HtmlComment LOGGER.error('justext %s %s', err, url) result_body = None else: for paragraph in [p for p in paragraphs if not p.is_boilerplate]: #if duplicate_test(paragraph) is not True: elem = etree.Element('p') elem.text = paragraph.text result_body.append(elem) return result_body
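# For readability, the positional tuning values in try_justext above can be written as
# keyword arguments. The helper below is a minimal equivalent sketch, assuming the
# justext.justext parameter order (length_low, length_high, stopwords_low, stopwords_high,
# max_link_density, max_heading_distance, no_headings); the helper name is ours, not
# part of any library, and the values simply mirror the call above.
def justext_with_named_thresholds(html_string, stoplist):
    return justext.justext(
        html_string, stoplist,
        length_low=50, length_high=200,          # paragraph length thresholds (characters)
        stopwords_low=0.1, stopwords_high=0.2,   # stopword-density thresholds
        max_link_density=0.2,                    # reject link-heavy paragraphs
        max_heading_distance=200,                # max distance of a heading from good text
        no_headings=True,                        # do not treat headings specially
    )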
def remove_boilerplate(page_str, lang, relaxed=False): """ Removes boilerplate from HTML documents. Uses JusText library. NOTE: quality dependent on correct language detection. :param page_str: str HTML page source. :param lang: str Google Translate language code. :param relaxed: boolean If True the span between the first and last good/near-good boilerplate match is returned. Short and bad segments in between are kept. :return: list List of non-boilerplate segments/paragraphs. """ if lang not in GTRANS_JUSTEXT_LANG_MAP: #raise AttributeError("Can not remove boilerplate for language code lang='%s'." % lang) return [] jt_lang = GTRANS_JUSTEXT_LANG_MAP[lang] paragraphs = justext.justext(page_str, justext.get_stoplist(jt_lang)) if relaxed: good_indexes = [paragraphs.index(p) for p in paragraphs if p.class_type in ['near-good', 'good']] if len(good_indexes) == 0: return [] return [paragraph.text for paragraph in paragraphs[min(good_indexes):max(good_indexes) + 1]] else: return [paragraph.text for paragraph in paragraphs if paragraph.class_type in ['near-good', 'good', 'short']]
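# Illustrative usage of remove_boilerplate above (a hypothetical demo: it assumes "en" is a
# key of GTRANS_JUSTEXT_LANG_MAP and that the caller already holds the raw HTML of a page).
def demo_remove_boilerplate(page_source):
    strict_segments = remove_boilerplate(page_source, "en")                 # good/near-good/short paragraphs
    relaxed_segments = remove_boilerplate(page_source, "en", relaxed=True)  # full span from first to last good/near-good match
    return strict_segments, relaxed_segments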
def get_article(articles, i, output): for article in tqdm(articles): try: a = newspaper.Article(article) a.download() a.parse() a.nlp() paragraphs = justext.justext(a.html, justext.get_stoplist("English")) text = '\n\n'.join( [p.text for p in paragraphs if not p.is_boilerplate]) if (len(text) > len(a.text) + 50): a.set_text(text) h = html2text.HTML2Text() h.ignore_links = True h.ignore_images = True a.set_html(h.handle(a.html)) except Exception as e: print(e) continue # TODO: config option? if len(a.text) < 400: continue output.append(a)
def get_url_article2(link, lang): ''' TO BE DONE : error handling : http://www.voidspace.org.uk/python/articles/urllib2.shtml#handling-exceptions ''' ### bug encodage if len(link) < 5: return False try: #l = link.decode("utf-8", errors='ignore') log.info("Retrieving : " + link) #hdr = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7' hdr = 'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:10.0) Gecko/20100101 Firefox/10.0' headers = {'User-Agent': hdr} resp = requests.get(link, headers=headers) resp.raise_for_status() page = resp.text #log.info(page) contents = '' #print(justext.get_stoplist()) paragraphs = justext.justext(page, justext.get_stoplist(lang)) for paragraph in paragraphs: if paragraph.class_type == 'good': #and re.search(r'Facebook connect|cliquez|Envoyer cet article par email|D.couvrez tous nos packs|d.j.un|recevoirnos|nosoffres|acc.dezà|cliquez ici|En poursuivant votre navigation sur ce site|accédezà|pasencore|Veuillez cliquer|créez gratuitement votre compte]',paragraph.text)== None: contents = contents + "\n" + paragraph.text cts = remove_control_characters(contents) if len(cts) == 0: log.warning("No contents for :" + link) # + " " + page return cts except requests.exceptions.RequestException as e: log.warning("Exception : " + str(e)) return False
def simple_text_extractor(html, stopwords='English'): import corpkit """extract text from html/xml files using justext""" import requests import justext import os import copy # if on hard disk: if type(html) != list: html_files = [copy.deepcopy(html)] else: html_files = copy.deepcopy(html) output = [] for html in html_files: if os.path.isfile(html): f = open(html) raw_html_text = f.read() # if it's a web address elif html.startswith('http'): response = requests.get(html) raw_html_text = response.content # if it's already html text: else: raw_html_text = copy.deepcopy(html) paragraphs = justext.justext(raw_html_text, justext.get_stoplist(stopwords)) text = [] for paragraph in paragraphs: if not paragraph.is_boilerplate: text.append(paragraph.text) text = '\n'.join(text) metadata = os.path.basename(html) tup = (text, metadata) output.append(tup) return output
def fetch(keyword, url, rank, articles, totalNumber): searchKeywords = keyword.split('" OR "') # We are going to check the article text for our keywords after being run through JusText response = requests.get(url) paragraphs = justext.justext(response.text, justext.get_stoplist("English")) empty = True containsKeyword = False minMentions = 3 mentions = 0 searchKeyword = searchKeywords[0].replace('"', '').strip().split(' ', 1)[0] # Get the first word of the search term articleParagraphs = [] for paragraph in paragraphs: if not paragraph.is_boilerplate: if searchKeyword in paragraph.text: mentions += 1 #paragraph.text.count(searchKeyword) articleParagraphs.append(paragraph.text) if (mentions < minMentions): #print("A website (" + url + ") did not have the keyword enough times! Removed.") return '''for searchKeyword in searchKeywords: searchKeyword = searchKeyword.replace('"', '').strip().split(' ', 1)[0] # Get the first word of the search term if searchKeyword in article: containsKeyword = True break if (containsKeyword == False): print("A website (" + url + ") does not contain the keyword! Removed.") return ''' articles.append(Article.Article(articleParagraphs, url, rank)) print("\r" + str(len(articles)) + " / " + str(totalNumber) + " articles crawled to for keyword " + keyword, end=' ') sys.stdout.flush()
def get_document(url): ''' This function will check if the url is valid and then proceed to parse it to produce a clean text (no html) which can be used as input to a recommendation engine. Arguments: url -- input url that needs to be checked and parsed ''' try: r = requests.head(url, allow_redirects=True) except requests.exceptions.ConnectionError as e: raise URLRetrievalError(url, 'Could not connect', e) if r.status_code != requests.codes.ok: raise URLRetrievalError( url, 'Invalid response code from remote server: {}'.format( r.status_code)) if r.headers["content-type"].split(';')[0] not in [ "text/html", "text/plain" ]: raise URLRetrievalError( url, 'Document has invalid MIME type: {}'.format( r.headers["content-type"])) raw = requests.get(url) paragraphs = justext.justext(raw.content, justext.get_stoplist("English")) text_only = '' for paragraph in paragraphs: if not paragraph.is_boilerplate: text_only += ' ' + paragraph.text if len(text_only) == 0: raise DocumentParsingError('Length of document is zero') return text_only
def get_text(html): paragraphs = justext.justext(html, justext.get_stoplist('English')) text = "" for paragraph in paragraphs: if not paragraph.is_boilerplate: # and not paragraph.is_header: text = text + paragraph.text + ". " return text
def __get_keywords(file, bnc_frequencies, keyword_dict={}, ignore_capitalized=False): f = codecs.open(file, "r", encoding="utf-8").read() paragraphs = justext.justext(f, justext.get_stoplist("English")) freqs = {} text_freqs = {} for paragraph in paragraphs: if not paragraph.is_boilerplate: tokens = nltk.word_tokenize(clean_text(paragraph.text, not ignore_capitalized)) for token in tokens: if ignore_capitalized and token != token.lower(): continue if token not in text_freqs: text_freqs[token] = 0 if token in freqs: text_freqs[token] += 1 continue elif token in bnc_frequencies: freqs[token] = bnc_frequencies[token] text_freqs[token] += 1 else: freqs[token] = 0 text_freqs[token] += 1 for f_key, f_value in text_freqs.iteritems(): if f_value < 2: del freqs[f_key] x = len(freqs.keys())/10 for i in range(x): min_word = min(freqs, key=freqs.get) if min_word not in keyword_dict: keyword_dict[min_word] = 0 keyword_dict[min_word] += text_freqs[min_word] del freqs[min_word]
def content(self): """ :return: Text content of the given document """ try: from os import path if path.isfile(self.document_location): import codecs with codecs.open(self.document_location, 'r', 'utf-8') as input_document: content = input_document.read() text = justext.justext(content, justext.get_stoplist("English")) res = [] # total_length = 0 for paragraph in text: if not paragraph.is_boilerplate: res.append(paragraph.text) # total_length += len(paragraph.text) # if total_length > 10000: # break res = '\n'.join(res) return res # return extract_text(content) else: logger.warning("Document not found: " + str(self.document_location)) except Exception as exc: logger.warning(exc) return ""
def try_justext(tree, filecontent, record_id): '''safety net: try with justext''' result_body = etree.Element('body') justtextstring = html.tostring(tree, pretty_print=False, encoding='unicode') LOGGER.info('raw length: %s (file) %s (tostring) ', len(filecontent), len(justtextstring)) try: # paragraphs = custom_justext(tree) paragraphs = justext.justext(justtextstring, JUSTEXT_STOPLIST) except ValueError as err: # ValueError: Input object is not an XML element: HtmlComment LOGGER.error('justext %s %s', err, record_id) return None for paragraph in paragraphs: if not paragraph.is_boilerplate: # if lrutest.has_key(paragraph.text) is False or lrutest[paragraph.text] <= 2: if duplicate_test(paragraph, justext_switch=True) is not True: elem = etree.Element('p') elem.text = paragraph.text result_body.append(elem) # jt += paragraph.text + '</p><p>' # jt += '</p>' # temp_jt = u' '.join(jt.itertext()) # temp_jt = jt return result_body
def get_text(link): response = requests.get(link) print(response) paragraphs = justext.justext(response.content, justext.get_stoplist("English")) text = "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate]) return text
def overallSentiment(urls, verbose=False): """ Guesses the overall sentiment of the given articles :param urls: List of URLs of articles to read :param verbose: Print status updates and specific verdicts :return: The proportion of articles that are positive """ sentiments = [] for url in urls: try: if verbose: print "Downloading", url + "..." response = requests.get(url) paragraphs = justext.justext(response.content, justext.get_stoplist("English")) allText = "\n".join([paragraph.text for paragraph in paragraphs]) if verbose: print "Reading..." sentiment = guessSentiment(allText) if verbose: print "Verdict:", sentiment sentiments.append(sentiment) except: if verbose: print "Failed to download", url positiveCount = len(filter(lambda x: x == "Positive", sentiments)) return float(positiveCount) / len(urls)
def get_output_justext(input_data):
    result = []
    # justext expects a stoplist (a set of stop words), not a language name string
    paragraphs = justext.justext(input_data, justext.get_stoplist('English'))
    for paragraph in paragraphs:
        result.append(paragraph.text)
    return "\n".join(result)
def __init__(self, url): np_extract = Article(url) np_extract.download() if np_extract.download_state == 2: try: np_extract.parse() np_text = np_extract.text except: np_text = '' else: np_text = '' jt_text = '' try: response = requests.get(url) paragraphs = justext.justext(response.content, justext.get_stoplist("English")) for paragraph in paragraphs: if not paragraph.is_boilerplate: jt_text = jt_text + str(paragraph.text) except: jt_text = '' if len(np_text) > len(jt_text): self.text = np_text else: self.text = jt_text self.original_title = np_extract.title self.tok = nltk.word_tokenize(self.text) self.img = list(np_extract.images) self.vid = list(np_extract.movies) self.url = url self.nchar = len(self.text) self.nword = len(self.tok)
def extract_paragraphs(content, is_html): if is_html: return [ p.text for p in justext.justext(content.encode('utf-8'), hr_stoplist, encoding='utf-8') if not p.dom_path.endswith('pre') ] return [p.strip() for p in content.split('\n')]
def get_text_without_boilerplate(htmlcontent): # htmlcontent = htmlcontent.replace('\n', ' ') try: paragraphs = justext(htmlcontent, get_stoplist("English")) except Exception as e: raise SnippetGenerationError( "failed to process document using justext", original_exception=e) allparatext = "" for paragraph in paragraphs: try: if not paragraph.is_boilerplate: allparatext += " {}".format(paragraph.text) except Exception as e: raise SnippetGenerationError( "failed to process document using justext", original_exception=e) if allparatext == "": for paragraph in paragraphs: try: allparatext += "{}".format(paragraph.text) except Exception as e: raise SnippetGenerationError( "failed to process document using justext", original_exception=e) return allparatext
def boilerplate_remove(inp_text, stopwordlist, entry_str): warc1, warc2, text = inp_text.split(b'\r\n\r\n', maxsplit=2) warc1 = warc1.decode('UTF-8').replace('\r\n', '\n') warc2 = warc2.decode('UTF-8').replace('\r\n', '\n') length = len(text) if length <= 13: # Threshold minimum: '<html></html>' is 13 long skip_action(warc1, warc2, 'LengthError({0})'.format(length), entry_str) return None try: paragraphs = justext.justext(text, stopwordlist) # TypeError JusText bug, AssertionError, ValueError JusText bug on comment... except (ParserError, UnicodeDecodeError, TypeError, AssertionError, ValueError) as err: # Do not distinguish between the different errors skip_action(warc1, warc2, err.__class__.__name__ + str(length), entry_str) return None # Escape paragraph for parsable XML text_removed = '\n\n'.join( ('<p>\n{0}\n</p>'.format(xml.sax.saxutils.escape(paragraph.text)) for paragraph in paragraphs if not paragraph.is_boilerplate)) if len(text_removed) == 0: skip_action(warc1, warc2, 'JusTextBadError({0})'.format(length), entry_str) return None filename, domain, url, warc_file, offset_str, length_str, response, mime_type = entry_str.split( ' ', maxsplit=8) filename = filename.replace('.gz', '') return '<doc domain="{0}" index="{1}" url="{2}" warc-file="{3}" offset="{4}" length="{5}" response="{6}"' \ ' mime-type="{7}">\n<meta>\n<request>\n{8}\n</request>\n' \ '<response>\n{9}\n</response>\n</meta>\n{10}\n</doc>\n\n\n'.\ format(domain, filename, url, warc_file, offset_str, length_str, response, mime_type, warc1, warc2, text_removed).encode('UTF-8')
def process(record):
    response = requests.get(record['WARC-Target-URI'])
    first = True
    if response.text:
        paragraphs = justext.justext(response.content, justext.get_stoplist("English"))
        heading = ""
        body = ""
        for paragraph in paragraphs:
            if first and paragraph.is_heading:
                #words = filter(lambda word: not word in stopword_set, paragraph.text.split())
                #heading = (' ').join(words)
                heading = paragraph.text
                first = False
            elif not paragraph.is_boilerplate and paragraph.class_type == 'good':
                #words = filter(lambda word: not word in stopword_set, paragraph.text.split())
                #body += (' ').join(words)
                body += " " + paragraph.text
        if body != "":
            body = body.replace('"', "---")
            body = body.replace('\n', "")
            #records.append({"URL":record['WARC-Target-URI'], "Title":heading, "Sentences": body})
            # close the JSON object and write bytes consistently (file is assumed to be opened in binary mode)
            file.write(("{\"URL\":\"" + record['WARC-Target-URI'] + "\",\"Title\":\"" + heading +
                        "\",\"Sentences\":\"" + body + "\"}").encode('utf-8').strip())
            file.write(b'\n')
def documents( self, doc_id_start = 1, max_doc_id = None ): if max_doc_id == None: max_doc_idx = self.count_doc() for i in range( doc_id_start, max_doc_idx ): doc = self.doc_retrieve( i ) if not doc.mime_type() == Wire.MIME_TEXT_HTML: continue text = self.retrieve_text_by_docid(i) try: paragraphs = justext.justext(text, self.stopwords) except lxml.etree.XMLSyntaxError: #print idx.url_by_docid(i), "bad html" continue except lxml.etree.ParserError: #print idx.url_by_docid(i), "bad html" continue except TypeError: #print idx.url_by_docid(i), "caused error" continue good_text = filter( lambda x: x['class'] == 'good', paragraphs ) if not good_text: continue content = [ unescape(p['text']) for p in good_text ] soup = BeautifulSoup( text ) title_node = soup.find('title') if title_node: title = unescape( title_node.getText().rstrip().lstrip() ) if not title_node: title = '' meta_nodes = soup.findAll('meta') description = '' for m in meta_nodes: try: if m['name'] == 'description' and m['content']: description = m['content'] break except KeyError: continue #print meta_nodes ''' if meta_nodes: description = meta_nodes[0]['content'] else: description = '' ''' #description = '' url = self.url_by_docid(i).decode('ascii', 'ignore') site = url.split('/')[0] doc_data = { 'title': unicode(title), 'url': unicode(url), 'site': site, 'content': content, 'description': description } yield doc_data
def get_article(item, source, reprocess=False): """Take the initial set of listings and enrich the content.""" article = dict() encoded = item.get('link').encode('utf-8') article['uuid'] = hashlib.sha256(encoded).hexdigest() processed = is_found(article['uuid']) if processed and not reprocess: return {'article': processed, 'from_store': True} article['title'] = item.get('title', None) href = item.get('link', None) article['href'] = strip_google(href) article['source'] = derive_source(article['href']) article['collected'] = now_time() article['published'] = item.get('published', None) article['summary'] = item.get('summary', None) page_content = get_page_content(article['href']) if not page_content: logger.debug("No content found: %s" % article['href']) return {'article': None, 'from_store': True} paragraphs = justext.justext(page_content, justext.get_stoplist("English"), no_headings=True, max_heading_distance=150, length_high=140, max_link_density=0.4, stopwords_low=0.2, stopwords_high=0.3) text_content = list() for paragraph in paragraphs: if paragraph.is_boilerplate: continue text_content.append(paragraph.text) text_content = '\n'.join(text_content) tokens = get_tokens(text_content) article['word_count'] = len(tokens) article['read_time'] = round(float(article['word_count']) / 250, 2) clean = cleaned_tokens(tokens) article['tokens'] = [{ t[0]: t[1] } for t in nltk.FreqDist(clean).most_common(100)] article['tags'] = [list(x.keys())[0] for x in article['tokens'][0:7]] article['sentiment'] = get_sentiment(text_content) article['feed_source'] = source.replace('www.google.com', 'google.com') articles = mongo.db[app.config['ARTICLES_COLLECTION']] if not reprocess: try: articles.insert(article) except: pass else: if not processed: try: articles.insert(article) except: pass articles.update({'_id': ObjectId(processed['_id'])}, {'$set': article}) return {'article': article, 'from_store': False}
def remove_boilerplate(html, language="English"): try: paragraphs = justext.justext(html, justext.get_stoplist(language)) except: return html # TODO alternative to justext tag = lambda p: ("%s\n----\n" if p.is_heading else "%s\n\n") % p.text content = "".join([tag(p) for p in paragraphs if not p.is_boilerplate]) return content
def html_to_text_justext(html_content_in_byte): paragraphs = justext.justext(html_content_in_byte, justext.get_stoplist("English")) boilerplate_free = [ paragraph.text for paragraph in paragraphs if not paragraph.is_boilerplate ] return "".join(boilerplate_free)
def remove_boilerplate(self, text): """ Removes website artifacts: "Skip to Main Content", "About Us", etc. """ jtext = justext.justext(text, justext.get_stoplist("English")) cleaned = [line.text for line in jtext if not line.is_boilerplate] cleaned_text = " ".join(cleaned) if cleaned else "" return cleaned_text
def webScraper(url): response = requests.get(url) paragraphs = justext.justext(response.content, justext.get_stoplist('English')) returningParagraphs = list() for item in paragraphs: returningParagraphs.append(item.text) return (returningParagraphs)
def get_doc_contents(filepath):
    contents = bytearray()
    with open(filepath, 'rb') as f:
        paragraphs = justext.justext(f.read(), justext.get_stoplist('English'))
        for para in paragraphs:
            if not para.is_boilerplate:
                contents.extend(para.text.encode('UTF8'))
    # decode the accumulated bytes rather than taking str() of the bytearray
    return cleanup(contents.decode('UTF8'))  # LIST OF CLEANED TOKENS
def getCorpus(html, stopwords, lmin, lmax): full_text = [] paragraphs = justext.justext(html, stopwords, lmin, lmax) for paragraph in paragraphs: if paragraph.cf_class == 'good': real_text = ''.join("%s" % i.encode('utf-8') for i in paragraph.text_nodes) full_text.append(real_text) return ' '.join(full_text)
def read_files(path, file_name, langue): contenu = codecs.open(path + file_name,'r',encoding='utf-8').read() paragraphs = justext.justext(contenu, justext.get_stoplist(langue)) chaine = "" for paragraph in paragraphs: if not paragraph.is_boilerplate: chaine+= paragraph.text+"\n" return chaine
def cleanHtml(html): # raw = nltk.clean_html(html) // was removed in nltk 3.0 # If you do not install justext, use beautifulsoup: # soup = BeautifulSoup(html) # raw = soup.get_text() # This will do a better job once you install justext paragraphs = justext.justext(html, justext.get_stoplist('English')) return "\n".join([p.text for p in paragraphs if not p.is_boilerplate])
def get_content(self, html): # I should refactor the other get_content when this fails into here lang_mapping = {'nl': 'Dutch', 'en': 'English', 'com': 'English'} if self.detected_language not in lang_mapping: return '' lang = lang_mapping[self.detected_language] body_content = [x.text for x in justext.justext(html, justext.get_stoplist(lang)) if not x.is_boilerplate and not x.is_heading] return body_content
def run_justext(htmlstring):
    '''try with the generic algorithm justext'''
    valid = list()
    paragraphs = justext.justext(htmlstring, justext.get_stoplist("German"),
                                 50, 200, 0.1, 0.2, 0.2, 200, True)  # length, stopword and link-density thresholds
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            valid.append(paragraph.text)
    result = ' '.join(valid)
    return result  # sanitize(result)
def jt_treatement(input_file, output_file): """ Defines the specific JusText treatment to perform from the input file to the output file. """ paragraphs = justext.justext(input_file.read(), justext.get_stoplist('English')) for paragraph in paragraphs: output_file.write("<p>" + paragraph.text.replace("\n", " ") + "</p>\n")
def run_justext(htmlstring): '''try with the generic algorithm justext''' valid = list() paragraphs = justext.justext(htmlstring, justext.get_stoplist("German")) for paragraph in paragraphs: if not paragraph.is_boilerplate: valid.append(paragraph.text) result = ' '.join(valid) return result
def parseHtmlToText(htmlContent): try: justextContent = justext.justext(htmlContent.encode("utf-8"), justext.get_stoplist('Estonian')) # text = getText(getParagraphs(justextContent)) except Exception: justextContent = "" text = getText(getParagraphs(justextContent)) #logger.info("Text length:"+len(text)) return text
def read_dial(self): response = requests.get( f'https://pidru4niki.com/15780506/filosofiya/osnovni_zakoni_dialektiki_svitoglyadne_metodologichne_znachennya' ) paragraphs = justext.justext(response.content, justext.get_stoplist("Ukrainian")) prs = [pp for pp in paragraphs if not pp.is_boilerplate] chosen_p = random.choice(list(prs)) self.speaker.tell_ua(chosen_p.text)
def text(self): if not self._text: if self._article.is_valid_body(): self._text = self._article.text else: self._text = '\n'.join(p.text for p in justext.justext( self._article.html, justext.get_stoplist("English"))) return self._text
def toJustText(webContent):
    print 'Entering toJustText'
    txt = ''
    paragraphs = justext.justext(webContent, justext.get_stoplist("English"))
    for paragraph in paragraphs:
        #if not paragraph.is_boilerplate:
        txt += smart_str(paragraph.text.encode('utf-8'))
    return txt
def clean_jusText_localFile(filename, language, outputfile) : try : with codecs.open(filename, "r", "utf-8") as f: with open(outputfile, "w") as output: content = f.read() paragraphs = justext.justext(content, justext.get_stoplist(CODE_LANG[language])) for paragraph in paragraphs: if not paragraph.is_boilerplate: output.write(paragraph.text.encode('utf-8')+"\n") except ValueError : print "[jusText] Stopwords list not available for "+language
def getText(self): text = '' try: response = requests.get(JusTextWrapper.iUrl) paragraphs = justext.justext(response.content, justext.get_stoplist("English")) for paragraph in paragraphs: if not paragraph.is_boilerplate: text += " "+paragraph.text return text except: return ""
def text(request): page = urllib2.urlopen(request.GET.get('url','')).read() paragraphs = justext.justext(page, justext.get_stoplist('English')) text = [] for paragraph in paragraphs: if paragraph['class'] == 'good': p = {} p['content'] = paragraph['text'] p['heading'] = paragraph['heading'] text.append(p) return HttpResponse(simplejson.dumps(text), 'application/json')
def get_url(webpage):
    doctext = bytearray()
    try:
        response = requests.get(webpage)
    except requests.exceptions.MissingSchema:
        webpage = 'http://' + webpage
        response = requests.get(webpage)
    paragraphs = justext.justext(response.content, justext.get_stoplist('English'))
    for para in paragraphs:
        if not para.is_boilerplate:
            doctext.extend(para.text.encode('UTF-8'))
    # decode the accumulated bytes rather than taking str() of the bytearray
    return cleanup(doctext.decode('UTF-8'))
def crawl_url(self, url): content = Content('','') try: request = urllib2.Request(url) page = urllib2.urlopen(request).read() if page: paragraphs = justext.justext(page, [], stopwords_high=0, stopwords_low = 0, length_low=LENGTH_LOW_DEFAULT) text = [para.text for para in paragraphs if not para.is_boilerplate] content = Content(url, '\n'.join(text)) except Exception as e: pass return content
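# Note on crawl_url above: passing an empty stoplist together with stopwords_low=0 and
# stopwords_high=0 runs jusText in its language-independent mode, so classification relies
# on paragraph length and link density only. A minimal standalone sketch of the same idea
# (the helper name is ours, not part of any library):
def strip_boilerplate_language_independent(html_string):
    paragraphs = justext.justext(html_string, frozenset(),
                                 stopwords_low=0, stopwords_high=0)
    return '\n'.join(p.text for p in paragraphs if not p.is_boilerplate)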
def get_text_from_reuters(link): response = requests.get(link) resText = response.content.decode("UTF-8", 'ignore') soup = BeautifulSoup(resText, 'html.parser') tmp = [x.extract() for x in soup.find_all(class_= "Edition_items_293of")] for tag in soup.find_all(["script", "meta", "head", "style", "noscript"]): tag.decompose() for tag in soup.find_all(True, class_= ["Attribution_content_27_rw", "Image_container_1tVQo"]): tag.decompose() paragraphs = justext.justext(soup.prettify(), justext.get_stoplist("English")) text = "\n\n".join([p.text for p in paragraphs if not p.is_boilerplate]) return text
def solo_texto(self):
    '''
    Called from trae_datos, where the result is assigned to self.texto.
    Fetches the content of each individual URL (each news item)
    via urllib.request.urlopen() and BeautifulSoup.
    '''
    if self.link:
        con_ac = 'áéíóúüñ'
        sin_ac = 'aeiouun'
        conv = str.maketrans(con_ac, sin_ac)
        self.link = self.link.translate(conv)
        texto = ''
        try:
            user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
            headers = {'User-Agent': user_agent, }
            req = urllib.request.Request(self.link, None, headers)
            art1 = urllib.request.urlopen(req)
            # signal.alarm(0)
        except:
            print("<a href = '" + self.link + "'>No connection (solo_texto) to the link</a>")
            return False
        art2 = art1.read()
        art1.close()
        try:
            #metas = parseString(art2)
            #print(1)
            try:
                paras = justext.justext(art2, justext.get_stoplist('Spanish'))
            except:
                print("Error in justext")
                paras = []  # avoid a NameError below if justext fails
            for para in paras:
                # use the attribute-based Paragraph API of current justext versions
                if para.class_type == 'good':
                    parra = para.text
                    parra = self.cambia_acentos(parra, self.acentos)
                    parra = parra.replace('Ã', 'Ó')
                    if parra.endswith('.'):
                        texto += " " + parra
                    else:
                        texto += " " + parra + "."
            if not texto:
                print("<a href='" + self.link + "'>No text received in trae_articulo " + self.fuente + "</a>")
            else:
                self.articulo = bs(art2)
                #print(2)
                if (self.articulo):
                    self.busca_fotos()
                #print(3)
            return texto
        except:
            print("<a href = '" + self.link + "'>justext errors for link</a>")
            return False
def scrape(url, title): text = str() try: page = requests.get(url) paragraphs = justext.justext(page.content, justext.get_stoplist('English')) for par in paragraphs: if par['class'] == 'good': text += par['text'] return text #Generic error catching is bad #As are printed log statements.... except Exception: print 'Something went wrong...'
def fetch(url): print 'Fetching: %s' % url if not redis_client.sismember('htmlcache:fetched', url): naked_url = protocol_re.sub('', url) long_filename = nonword_re.sub('-', naked_url) filename = long_filename[:255] html = requests.get(url, timeout=10).text text = u'\n'.join(p['text'] for p in justext.justext(html, stopwords) if p['class'] == 'good') with open('%s/%s' % (opts.directory, filename), 'w') as fp: fp.write(text.encode('utf8')) redis_client.sadd('htmlcache:fetched', url) percent = (100.0 * len(text)) / (len(html) + 1) print ' Size reduced: %d -> %d (%0.2f%%)' % (len(html), len(text), percent) else: print ' Already fetched'
def remove_bad_by_classifier(doc): ps = justext.justext( doc, justext.get_stoplist('English')) to_delete = [] good = [] for p in ps: if p['class'] == 'bad': for el in doc.xpath(p['xpath']): to_delete.append((el, p['xpath'])) elif p['class'] == 'good': good.append(p['xpath']) for el, xp in reversed(to_delete): if el.getparent() is not None and not any(xp in g for g in good): el.drop_tree()
def parse(self, response): hxs = HtmlXPathSelector(response) titulo = hxs.select('/html/head/title/text()').extract() rules = (Rule(SgmlLinkExtractor(allow='.*'),follow=True,callback='parse')) corpo = justext.justext(response.body, justext.get_stoplist('Portuguese')) texto = '' for paragrafo in corpo: if paragrafo['class'] == 'good': texto += paragrafo['text'] item = Pagina() item['url'] = response.url item['titulo'] = unicode(titulo[0]) item['texto'] = unicode(texto) item['tipo'] = self.name return item