import os


def extract_v2(htmlstring_or_filelike):
    import eatiht.v2 as v2
    # HMTL_DIR is assumed to be defined at module level in the original source.
    if os.path.exists(HMTL_DIR + htmlstring_or_filelike):
        with open(HMTL_DIR + htmlstring_or_filelike, 'r') as fr:
            text = v2.extract(fr)
    else:
        text = v2.extract(htmlstring_or_filelike)
    return text
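# Hypothetical usage sketch (not from the original source): extract_v2 accepts either a
# filename under HMTL_DIR or a URL / HTML input that eatiht can parse, so the file name
# and URL below are assumptions for illustration only.
print(extract_v2('saved_article.html'))          # read from HMTL_DIR when the file exists
print(extract_v2('http://example.com/article'))  # otherwise handed straight to eatiht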
import StringIO

import warc
import eatiht.v2 as v2


def clean_warc(input):
    # Extract the readable text from the capture, then read the WARC metadata
    # (target URL and capture date) from its records.
    text = v2.extract(input)
    warc_content = warc.WARCFile(fileobj=StringIO.StringIO(input))
    for record in warc_content:
        url, date = record['WARC-Target-URI'], record['WARC-Date']
    return '%s,%s\n%s' % (url, date, text)
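# Hypothetical usage sketch (not from the original source): read a saved WARC capture from
# disk and print the "url,date\n<extracted text>" string that clean_warc builds; the path
# 'capture.warc' is an assumption for illustration only.
with open('capture.warc', 'rb') as warc_file:
    print clean_warc(warc_file.read())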
import eatiht
import eatiht.etv2 as etv2
import eatiht.v2 as v2

# ARTICLE is assumed to be a URL or file path defined elsewhere in the original script.


def try_eatiht():
    print("===EATIHT V2===")
    tree = etv2.extract(ARTICLE)
    tree.bootstrapify()
    print(tree.get_html_string())

    print("===V2===")
    print(v2.extract(ARTICLE))

    print("===V1===")
    print(eatiht.extract(ARTICLE))
import os
import re

from elasticsearch import Elasticsearch
import eatiht.v2 as v2


def scrapinsert():
    appbase_app = "3"
    appbase_doc_type = "article"
    appbase_app_username = "******"
    appbase_app_password = "******"

    ## Ping Appbase that it has been installed and scraping has started
    ## Run the wget scraping script here, or another file which calls wget and then this file
    # es = Elasticsearch()
    # url = "@localhost:9200"

    body_settings = '''{
        "analysis": {
            "filter": {
                "nGram_filter": {
                    "type": "nGram",
                    "min_gram": 2,
                    "max_gram": 20,
                    "token_chars": ["letter", "digit", "punctuation", "symbol"]
                }
            },
            "analyzer": {
                "nGram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding", "nGram_filter"]
                },
                "body_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding", "stop", "snowball", "word_delimiter"]
                },
                "whitespace_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding"]
                }
            }
        }
    }'''

    body_mapping = '''{
        "article": {
            "properties": {
                "title": {
                    "type": "string",
                    "index_analyzer": "nGram_analyzer",
                    "search_analyzer": "whitespace_analyzer"
                },
                "link": {
                    "type": "string",
                    "index": "not_analyzed"
                },
                "body": {
                    "type": "string",
                    "analyzer": "body_analyzer"
                }
            }
        }
    }'''

    url = '@scalr.api.appbase.io'
    es = Elasticsearch('https://' + appbase_app_username + ':' + appbase_app_password + url)

    # The analysis settings can only be changed while the index is closed.
    es.indices.close(index=appbase_app)
    es.indices.put_settings(index=appbase_app, body=body_settings)
    es.indices.open(index=appbase_app)
    print es.indices.put_mapping(index=appbase_app, body=body_mapping, doc_type=appbase_doc_type)

    # Make sure the Appbase Elasticsearch endpoint is called with HTTP authentication
    # Create the mapping in Elasticsearch
    # Go through all the files in a directory and extract the title
    ## Instead of going through the current directory, go through everything
    for root, dirnames, filenames in os.walk('yourstory.com'):
        for file_name in filenames:
            file_path = os.path.join(root, file_name)
            # if os.path.isfile(file_name) and "html" in file_name:
            file = open(file_path, 'r')
            regex = re.compile('<title>(.*?)</title>', re.IGNORECASE | re.DOTALL)
            title = regex.search(file.read())
            if title:
                title = title.group(1)
            body = v2.extract("file://" + os.path.abspath(file_path))
            try:
                ## Remove \n and all such characters from the body
                if body:
                    ## Store the link as the id; if there is an error, check and then upsert
                    result = es.index(index=appbase_app, doc_type=appbase_doc_type, body={
                        'body': body,
                        'title': title,
                        'link': file_path
                    })
                else:
                    print "Error at " + file_name
            except:
                print file_name
                print "Unable to add it to Elastic Search"
def fetch_wikipage(self, term):
    # TODO: use wiki_conn if available.
    # `wikipedia` (base URL), `wiki_footnote_re` and `HTTPError` are assumed to be defined
    # at module level in the original source.
    url = wikipedia + term.replace(' ', '_')
    try:
        text = eatiht.extract(url)
    except HTTPError:
        # Page doesn't exist for this term
        return ''
    text = wiki_footnote_re.sub(' ', text)
    return text
from bs4 import BeautifulSoup
import eatiht.v2 as v2

# EntityExtract and self._make_datetime are project-local helpers assumed to be available.


def extract_news(self, filename):
    f = open(filename)
    content = v2.extract(f)

    # Pull named entities out of the extracted article text.
    ee = EntityExtract()
    names = ee.extract_name(content)

    doc = {
        "content": content,
        "entities": names
    }

    # Re-read the raw HTML to pick up the title and OpenGraph/article metadata.
    f.seek(0)
    meta_data = {
        "url": "og:url",
        "sitename": "og:site_name",
        "published_time": "article:published_time"
    }

    soup = BeautifulSoup(f)
    doc['title'] = soup.title.string
    for k in meta_data.keys():
        try:
            value = soup.find(property=meta_data[k])['content']
            doc[k] = value
        except Exception as exc:
            pass

    if 'published_time' in doc:
        doc['published_time'] = self._make_datetime(doc.pop('published_time'))

    return doc
def getEatiht(url):
    try:
        text = v2.extract(url).replace('\n', '')
        return '', text
    except Exception, e:
        return '', ''
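# Hypothetical usage sketch (not from the original source): getEatiht always returns a
# two-element tuple, so a failed extraction simply comes back as two empty strings; the
# URL below is an assumption for illustration only.
_, article_text = getEatiht('http://example.com/some-article')
if not article_text:
    print 'extraction failed or the page was empty'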
def extract_html(path):
    with open(path, "r") as f:
        return v2.extract(f)
import sys
sys.path.append("eatiht/")

import eatiht.v2 as v2

url = 'http://www.washingtonpost.com/blogs/the-switch/wp/2014/12/26/elon-musk-the-new-tesla-roadster-can-travel-some-400-miles-on-a-single-charge/'
print(v2.extract(url))
import json

import feedparser
import eatiht.v2 as v2

# summ (summarizer) and rake (keyword extractor) are project-local helper modules
# assumed to be importable in the original script.

#feed = feedparser.parse('http://rss.cnn.com/rss/edition_technology.rss')
#feed = feedparser.parse('http://feeds.bbci.co.uk/news/technology/rss.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/World.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml')
feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Arts.xml')

topic = feed['feed']['title']

for post in feed.entries:
    title = post.title
    link = post.link
    text_file = open("db/" + title + ".txt", 'w+')

    record = {}
    date = post.published

    if "media_content" in post:
        image = post["media_content"][0]["url"]
        record["Image"] = image

    # Extract the article body, then summarize it and pull keywords from it.
    content = v2.extract(link)
    summary = summ.getSummary(title, content)
    keyword = rake.getKeyword(content)

    record["Title"] = title
    record["Link"] = link
    record["Date"] = date
    #record["Content"] = content
    record["Summary"] = summary
    record["Keywords"] = keyword
    record["Class"] = "Arts"

    text_file.write(json.dumps(record))
    text_file.close()