def test_content_and_content_comments_extractor_blocks(html):
    """
    The content and content/comments extractor should return proper blocks
    """
    content = extract_content(html, as_blocks=True)
    content_comments = extract_comments(html, as_blocks=True)

    passed_content = False
    passed_content_comments = False
    for i in range(5):
        # actual_content, actual_content_comments = \
        #     content_and_content_comments_extractor.analyze(
        #         html, blocks=True)
        actual_content = extract_content(html, as_blocks=True)
        actual_content_comments = extract_comments(html, as_blocks=True)
        passed_content = (
            [blk.text for blk in actual_content] ==
            [blk.text for blk in content])
        passed_content_comments = (
            [blk.text for blk in actual_content_comments] ==
            [blk.text for blk in content_comments])
        if passed_content and passed_content_comments:
            break

    assert passed_content
    assert passed_content_comments
def test_content_and_content_comments_extractor_blocks(self):
    '''
    The content and content/comments extractor should return proper blocks
    '''
    content = extract_content(self._html, as_blocks=True)
    content_comments = extract_comments(self._html, as_blocks=True)

    passed_content = False
    passed_content_comments = False
    for i in range(5):
        # actual_content, actual_content_comments = \
        #     content_and_content_comments_extractor.analyze(
        #         self._html, blocks=True)
        actual_content = extract_content(self._html, as_blocks=True)
        actual_content_comments = extract_comments(self._html, as_blocks=True)
        passed_content = (
            [blk.text for blk in actual_content] ==
            [blk.text for blk in content])
        passed_content_comments = (
            [blk.text for blk in actual_content_comments] ==
            [blk.text for blk in content_comments])
        if passed_content and passed_content_comments:
            break

    self.assertTrue(passed_content)
    self.assertTrue(passed_content_comments)
def worker(payload):
    line, _, path, encoding, content, _ = payload
    if not is_supported_encoding(encoding):
        return UnknownEncodingError('Unknown encoding: "%s"' % encoding), line, None

    # Reading file
    if content is None:
        try:
            if path.endswith('.gz'):
                with open(path, 'rb') as f:
                    raw_html_bytes = gzip.decompress(f.read())
                raw_html = raw_html_bytes.decode(encoding, errors='replace')
            else:
                with codecs.open(path, 'r', encoding=encoding, errors='replace') as f:
                    raw_html = f.read()
        except UnicodeDecodeError as e:
            return e, line, None
    else:
        raw_html = content

    # Attempting extraction
    try:
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            content = extract_content(raw_html)
    except BaseException as e:
        return e, line, None

    return None, line, content
def test_content_and_content_comments_extractor(html):
    content = extract_content(html)
    content_comments = extract_comments(html)

    passed_content = False
    passed_content_comments = False
    for i in range(10):
        # actual_content, actual_content_comments = \
        #     extract_content_and_comments(html)
        actual_content = extract_content(html)
        actual_content_comments = extract_comments(html)
        passed_content = actual_content == content
        passed_content_comments = (actual_content_comments == content_comments)
        if passed_content and passed_content_comments:
            break

    assert passed_content
    assert passed_content_comments
def process_item(self, item, _):
    text = extract_content(item['html'])
    if text:
        item['text'] = text
    else:
        raise DropItem
    return item
def process_one(self, content):
    try:
        soup = BeautifulSoup(content, 'html.parser')
        # soup.title is a Tag; keep its text so ContentResult gets a plain string
        title = soup.title.string if soup.title else ''
        f_content = extract_content(content)
        return ContentResult(title, f_content)
    except Exception as e:
        logger.error(f"Dragnet extraction failed with error {e}")
        return ContentResult('', '')
def test_content_and_content_comments_extractor(self):
    content = extract_content(self._html)
    content_comments = extract_comments(self._html)

    passed_content = False
    passed_content_comments = False
    for i in range(10):
        # actual_content, actual_content_comments = \
        #     extract_content_and_comments(self._html)
        actual_content = extract_content(self._html)
        actual_content_comments = extract_comments(self._html)
        passed_content = actual_content == content
        passed_content_comments = (actual_content_comments == content_comments)
        if passed_content and passed_content_comments:
            break

    self.assertTrue(passed_content)
    self.assertTrue(passed_content_comments)
def process_item(self, item, spider):
    fullHTML = item['content']
    content = extract_content(fullHTML)
    item['content'] = unicodedata.normalize("NFKD", content).replace(
        "\n", " ").replace("\t", "").replace(" ", " ")
    item['link_text'] = unicodedata.normalize(
        "NFKD", item['link_text']).replace("\n", " ").replace("\t", "").replace(" ", " ")
    return item
def do_request(url):
    try:
        requests.head(url, verify=False, timeout=10, headers=headers)
    except Exception:
        return "", 404
    try:
        res = requests.get(url, verify=False, timeout=10, headers=headers)
        content = extract_content(res.content)
        return content, res.status_code
    except Exception:
        return "", 404
def main():
    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        item_id = path.stem.split('.')[0]
        content = extract_content(html, encoding='utf8')
        output[item_id] = {'articleBody': content}
    (Path('output') / 'dragnet.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')
def HTML_to_content(filename):
    """Creates a txt file in /data/contentfiles/ with the main content of the HTML file."""
    if not os.path.isfile(
            os.path.join(CONTENT_FILES_PATH, filename[:-5] + '.txt')):
        with open(os.path.join(HTML_FILES_PATH, filename)) as f:
            html_string = f.read()
        try:
            dragnet_result = dragnet.extract_content(html_string)
        except Exception as e:
            print('Dragnet extraction error:', e)
            dragnet_result = 'Dragnet extraction error'
        # Write the extracted text as UTF-8; writing encoded bytes to a
        # text-mode file would raise a TypeError under Python 3.
        with open(os.path.join(CONTENT_FILES_PATH, filename[:-5] + '.txt'),
                  'w', encoding='utf-8') as result:
            result.write(dragnet_result)
def process_item(self, response):
    item = MyScraperItem()
    item['url'] = response.url
    item['link_text'] = response.meta['link_text']
    item['company'] = response.meta['company']
    item['content'] = response.body
    item['keywords'] = response.meta['keywords']

    fullHTML = item['content']
    content = extract_content(fullHTML)
    item['content'] = unicodedata.normalize("NFKD", content).replace(
        "\n", " ").replace("\t", "").replace(" ", " ")
    item['link_text'] = unicodedata.normalize(
        "NFKD", item['link_text']).replace("\n", " ").replace("\t", "").replace(" ", " ")

    valid = True
    if not item['url']:
        valid = False
        raise DropItem("Missing url!")
    if not item['company']:
        valid = False
        raise DropItem("Missing company!")
    if item['content'] == '':
        valid = False
        raise DropItem("empty content")

    # save to local mongodb database
    if valid:
        res_dict = {key: item[key] for key in item}
        connection = pymongo.MongoClient(port=27017, username='******',
                                         password='******', authSource="admin")
        db = connection['admin']
        collection = db[item['company']]
        collection.insert_one(res_dict)
        connection.close()
    return item
def parse_news_text(self, page_html: str, url: str) -> dict:
    news_text = re.sub(r'\s+', r' ', extract_content(page_html, encoding='utf-8'))
    return {'url': url, 'text': news_text}
def benchmark(extract_size=800):
    """Picks a random html file and prints an extract of the result of each method."""
    random_file = random_html_file()
    with open(join(DATA_PATH, random_file), 'r') as f:
        html_string = f.read()

    # GOOSE
    try:
        g = Goose({
            'browser_user_agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
            'enable_image_fetching': False
        })
        goose_article = g.extract(raw_html=html_string)
        goose_result = goose_article.cleaned_text
    except Exception:
        goose_result = ' Goose error.'

    # EATIHT
    try:
        eatiht_result = eatiht.extract(html_string)
    except Exception:
        eatiht_result = ' Eatiht error.'

    # DRAGNET
    try:
        dragnet_result = dragnet.extract_content(html_string)
    except Exception as e:
        dragnet_result = ' Dragnet error: ' + str(e)

    # LIBEXTRACT
    try:
        textnodes = list(libextract.api.extract(html_string))
        libextract_result = textnodes[0].text_content()
    except Exception:
        libextract_result = ' Libextract error.'

    # BOILERPIPE (CanolaExtractor)
    try:
        extractor = Extractor(extractor='CanolaExtractor', html=html_string)
        boilerpipe_result = extractor.getText()
    except Exception:
        boilerpipe_result = ' Boilerpipe error.'

    # NEWSPAPER
    try:
        article = Article('url')
        article.download(input_html=html_string)
        article.parse()
        print('Authors:', article.authors)
        print('Publication date:', article.publish_date)
        newspaper_result = article.text
    except Exception:
        newspaper_result = ' Newspaper error.'

    # JUSTEXT
    try:
        paragraphs = justext.justext(html_string, justext.get_stoplist("French"))
        print('PARAGRAPHS')
        for p in paragraphs:
            if not p.is_boilerplate:
                print(p.text)
        justext_result = '\n'.join(paragraph.text for paragraph in paragraphs
                                   if not paragraph.is_boilerplate)
        print('JUSTEXT_RESULT', justext_result)
    except Exception as e:
        justext_result = ' Justext error: ' + str(e)
        print(justext_result)

    # Results
    url = None  # fallback when no matching entry is found in urls.csv
    try:
        # finds the url associated with the file in a "filename-url" csv
        with open('./data/urls.csv', 'r') as csvfile:
            urls = dict((line['id'], line['url']) for line in csv.DictReader(csvfile))
        url = urls[random_file[:-5]]
        print('\n\n >>> URL n.' + random_file[:-5] + ' : ' + url)
    except Exception:
        print('\n\n (URL of the html file not found. To print the associated URL, '
              'please provide a urls.csv file featuring filename & url in /data)')

    # webbrowser.open(url, autoraise=False)
    path = abspath('temp.html')
    local_url = 'file://' + path
    with open(path, 'w') as f:
        f.write(html_string)
    webbrowser.open(local_url)

    # print('\n\n /// GOOSE /// \n')
    # print(goose_result[:extract_size])
    # print('\n\n /// EATIHT /// \n')
    # print(eatiht_result[:extract_size])
    print('\n ------ [[DRAGNET]] ------', len(dragnet_result), 'characters\n')
    print(dragnet_result[:extract_size] + '\n...\n' + dragnet_result[-extract_size:])
    print('\n ------ [[NEWSPAPER]] ------', len(newspaper_result), 'characters\n')
    print(newspaper_result[:extract_size] + '\n...\n' + newspaper_result[-extract_size:])
    print('\n ------ [[JUSTEXT]] ------', len(justext_result), 'characters\n')
    print(justext_result[:extract_size] + '\n...\n' + justext_result[-extract_size:])
    # print('\n\n /// LIBEXTRACT /// \n')
    # print(libextract_result[:extract_size])
    # print('\n\n /// BOILERPIPE (CanolaExtractor) /// \n\n')
    # print(boilerpipe_result[:extract_size])
    # print('\n\n')
    return url
def scraper(self, html, link):
    text = extract_content(html)
    if "cnbc" in link:
        # in this case content is extracted also in the comments
        text += " " + extract_comments(html)
        text = text.split("disclaimer")[0]
    return text
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'
response = requests.get(url)
htmlcode = response.content

readability = Readability(htmlcode, url)
print(readability.title)
print(readability.content)
q.d()

s_html = readability.content

from dragnet import extract_content, extract_content_and_comments

print(s_html)
q.d()

content = extract_content(s_html)
print(content)
q.d()

# get article and comments (run on the article HTML, not on the already-extracted text)
content_comments = extract_content_and_comments(s_html)
print(content_comments)
q.d()
def get_webpage(url):
    r = requests.get(url)
    content = extract_content(r.content)
    return content
# Fall back to an empty string so the check below works even if extraction raises
text = ''
try:
    if result.language == 'ja':
        extractor.analyse(content)
        text, title = extractor.as_text()
        text = re.sub(r'名前:[^\s]+', '', text)
        text = re.sub(r'ID:[^\s]+', '', text)
        text = re.sub(r'https?:[^\s]+', '', text)
        text = re.sub(r'[0-9]+ +[0-9]+:[0-9]+:[0-9]+\.[0-9]+', '', text)
        text = re.sub(r'[<<>>]+\s?[0-9\s]+', '', text)
        text = re.sub(r'引用元:', '', text)
        text = re.sub(
            r'[0-9]+\s+:[0-9]+\/[0-9]+\/[0-9]+.+?[0-9]+:[0-9]+:[0-9]+\.[0-9]+',
            '', text)
        text = re.sub(r'<[^>]+>', '', text)
    else:
        text = extract_content(content)
except Exception as e:
    print(e)

if not text:
    text = result.summary

extracted_content = result.title.strip() + " " + text.strip()

# extract main image
bs = BeautifulSoup(content, "lxml")
max = 0
primary_image_url = None
for image_url in bs.find_all('img'):
    src = image_url.get('src')
def parse_article(self, response):
    news_id = 19684  # response.meta.get('news_id')

    # save to file
    with open(str(news_id) + '.html', 'wb') as fh:
        fh.write(response.body)

    article = Article(response.url)
    # set html manually
    with open(str(news_id) + '.html', 'rb') as fh:
        article.html = fh.read()
    os.remove(str(news_id) + '.html')
    # need to set download_state to 2 for this to work
    article.download_state = 2
    article.parse()
    article.nlp()

    date = article.publish_date
    keywords = str([x.replace("'", "''") for x in article.keywords]).replace('"', '\'')
    content = article.text.replace("'", "''")
    summary = article.summary.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''") for x in article.authors]).replace('"', '\'')
    tags = str([x.replace("'", "''") for x in article.meta_keywords]).replace('"', '\'')
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", ' +
        '"Summary", "Authors", "Tags", "Title") ' +
        'VALUES (' + str(news_id) + ', ' + str(date) + ', \'' + content +
        '\', ARRAY ' + str(keywords) + '::text[], \'' + summary +
        '\', ARRAY ' + str(authors) + '::text[], ARRAY ' + str(tags) +
        '::text[], \'' + title + '\')')

    # get main article without comments
    content = extract_content(response.text).replace("'", "''")
    # get article and comments
    content_comments = '[\'' + extract_content_and_comments(
        response.text).replace("'", "''") + '\']'
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") ' +
        'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY ' +
        str(content_comments) + '::text[])')

    date = articleDateExtractor.extractArticlePublishedDate(
        articleLink=response.url, html=response.text)
    if date is not None:
        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-ade"("IDNews", "Date") ' +
            'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')

    g = Goose()
    article = g.extract(raw_html=response.text)
    date = article.publish_datetime_utc
    keywords = str([x.replace("'", "''") for x in article.tags]).replace('"', '\'')
    content = article.cleaned_text.replace("'", "''")
    summary = article.meta_description.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''") for x in article.authors]).replace('"', '\'')
    tags = str([x.replace("'", "''")
                for x in article.meta_keywords.split(",")]).replace('"', '\'')
    tweets = str([x.replace("'", "''") for x in article.tweets]).replace('"', '\'')
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-goose"(' +
        '"IDNews", "Date", "Content", "Keywords", "Summary", ' +
        '"Authors", "Tags", "Tweets",' +
        '"Title") VALUES (' + str(news_id) + ', ' + date + ', \'' + content +
        '\', ARRAY ' + str(keywords) + '::text[], \'' + str(summary) +
        '\', ARRAY ' + str(authors) + '::text[], ARRAY ' + str(tags) +
        '::text[], ARRAY ' + str(tweets) + '::text[], \'' + str(title) + '\')')
def save_snapshot(self):
    try:
        r = requests.get(self.url)
    except (
        requests.exceptions.SSLError,
        requests.exceptions.ConnectionError,
        requests.exceptions.ReadTimeout,
    ) as e:
        print(e)
        return None

    snapshot = {
        "bookmark": self,
        "content": r.text,
        "headers_json": json.dumps(
            {item[0]: item[1] for item in r.headers.items()}
        ),
        "status_code": r.status_code,
    }

    ogp = None  # keep a handle even if OpenGraph parsing fails below
    try:
        ogp = LaterOpenGraph(html=r.text)
        snapshot["opengraph_json"] = ogp.to_json()
    except AttributeError:
        print("OpenGraph Error")

    try:
        snapshot["parsed_content"] = extract_content(r.text)
    except BlockifyError:
        print("dragnet extract_content: BlockifyError")
        snapshot["parsed_content"] = ""

    try:
        tags = favicon_tags(self.url, r.text)
        tags = sorted(tags, key=lambda i: i.width + i.height, reverse=True)
        snapshot["favicon"] = tags[0].url
        print(snapshot["favicon"])
    except IndexError:
        print("No Favicon Found")

    try:
        tr4w = TextRank4Keyword()
        tr4w.analyze(snapshot["parsed_content"])
        keywords_weighted = tr4w.node_weight.items()
        keywords_sorted = sorted(
            keywords_weighted, key=lambda item: item[1], reverse=True
        )
        tags = [k.lower() for (k, v) in keywords_sorted if len(k) < 100][:9]
        self.tags.add(*tags)
    except MemoryError:
        print("MemoryError while parsing keywords")

    # If the bookmark does not yet have a title, grab it from the document title
    if not self.title:
        try:
            parser = etree.XMLParser(recover=True)
            document = etree.fromstring(r.text, parser)
            self.title = document.find(".//title").text
            self.save()
        except ValueError:
            print("Error parsing document...")
        except AttributeError:
            print("No title tag found...")

    # If we still don't have a title, grab it from the opengraph tags
    if not self.title and ogp and ogp.get("title"):
        self.title = ogp.get("title")
        self.save()

    return Snapshot.objects.create(**snapshot)
def run_dragnet(htmlstring):
    '''try with the dragnet module'''
    content = extract_content(htmlstring)
    return content  # sanitize(content)
def parse_keywords(self, response):
    item = Content()
    item["object_id"] = self.object_id
    item["content"] = extract_content(response.body)
    return item
import requests
from dragnet import extract_content, extract_content_and_comments
import q

# fetch HTML
# url = 'https://moz.com/devblog/dragnet-content-extraction-from-diverse-feature-sets/'
url = 'https://tech.sina.com.cn/i/2019-04-29/doc-ihvhiqax5802337.shtml'
r = requests.get(url)

# get main article without comments
content = extract_content(r.content)
print(content)
q.d()

# get article and comments
content_comments = extract_content_and_comments(r.content)
print(content_comments)
q.d()
def process_item(self, item, spider):
    fullHTML = item['content']
    content = extract_content(fullHTML)
    item['content'] = content
    return item
def dragnet_extract_content():
    return extract_content(request.data.decode('utf-8'))
def extract(self, html):
    if self.warm is False:
        # run one throwaway extraction so the first real call is not slowed by model loading
        dragnet.extract_content("test")
        print("Warmed Dragnet model")
        self.warm = True
    return dragnet.extract_content(html)
def results():
    # form = request.form
    if request.method == 'POST':
        # write your function that loads the model
        # model = get_model()  # you can use pickle to load the trained model
        # model = pickle.load(open('model.pkl', 'rb'))

        # Extract the content
        url = request.form['url']
        r = requests.get(url)
        content = extract_content(r.content)
        # text = content.split('\n')[0] + content.split('\n')[1]  # get the first and second sentence

        # Extract the headline
        headline = headline_func(url)

        # Merge the headline and the first sentence
        text = headline + " " + content.split('\n')[0]

        # Pass the text into the preprocessing function
        preprocessed_text = text_prep(text)

        # Predict the category
        predicted = cat_prediction(preprocessed_text)[0]
        predicted = category.get(predicted)

        # Predict the topics for the document
        doc = preprocessed_text.split()
        doc_vector = lda_model.id2word.doc2bow(doc)
        doc_topics = lda_model[doc_vector]
        sorted_by_prob = sorted(doc_topics, key=lambda tup: tup[1], reverse=True)
        # return render_template('resultsform.html', text=text, predicted_category=predicted)
        # return Response()

        # Sentiment of the news by VADER
        text_series = pd.Series(preprocessed_text)
        score = text_series.apply(
            lambda t: analyser.polarity_scores(t)['compound'])
        sentiment_vader = ('positive' if score[0] > 0
                           else 'negative' if score[0] < 0 else 'neutral')

        # Sentiment by TextBlob (PatternAnalyzer)
        blob = TextBlob(text)
        pol = blob.sentences[0].sentiment.polarity
        print(text)

        return jsonify({
            'data': {
                "category": predicted,
                "url": url,
                "body": content,
                "topics": topics.get(sorted_by_prob[0][0]),
                "Sentiment_vader": sentiment_vader,
                "Sentiment_score_vader": score[0],
                "Sentiment_textblob": pol,
                "headline": headline
            }
        })