Example No. 1
def exract_v2(htmlstring_or_filelike):
    import os
    import eatiht.v2 as v2
    # HMTL_DIR is assumed to be defined at module level (directory of saved HTML files).
    if os.path.exists(HMTL_DIR + htmlstring_or_filelike):
        # The argument names a saved file: extract from the open file object.
        with open(HMTL_DIR + htmlstring_or_filelike, 'r') as fr:
            text = v2.extract(fr)
    else:
        # Otherwise hand the argument (HTML string, path or URL) straight to eatiht.
        text = v2.extract(htmlstring_or_filelike)
    return text
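The function above expects an HMTL_DIR constant in the enclosing module; a minimal driver, with a placeholder directory and filename, might look like this:

HMTL_DIR = 'saved_pages/'   # placeholder: directory of locally saved HTML files

# Uses the saved copy when it exists, otherwise passes the argument straight to eatiht.
text = exract_v2('article.html')
print(text)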
Example No. 2
def clean_warc(input):

    # Pull the readable text out of the raw WARC payload with eatiht v2.
    text = v2.extract(input)
    warc_content = warc.WARCFile(fileobj=StringIO.StringIO(input))

    # Keep the URL and date of the last record in the archive; assumes at least one record.
    for record in warc_content:
        url, date = record['WARC-Target-URI'], record['WARC-Date']

    return '%s,%s\n%s' % (url, date, text)
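clean_warc assumes warc, StringIO and eatiht.v2 are imported at module level; a minimal Python 2 driver that feeds it a local archive might look like this (the filename is a placeholder, and the whole archive is read into memory):

import warc
import StringIO
import eatiht.v2 as v2

# Read the raw WARC payload and hand it to clean_warc, which extracts the text
# and reports the URL and date of the last record.
with open('crawl.warc', 'rb') as f:
    raw = f.read()

print clean_warc(raw)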
Example No. 4
def try_eatiht():
    # etv2 returns a tree object that can be styled and rendered back to HTML.
    print("===EATIHT V2===")
    tree = etv2.extract(ARTICLE)
    tree.bootstrapify()
    print(tree.get_html_string())

    # v2 and the original v1 API both return the extracted text directly.
    print("===V2===")
    print(v2.extract(ARTICLE))

    print("===V1===")
    print(eatiht.extract(ARTICLE))
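try_eatiht relies on the three eatiht modules and an ARTICLE constant being defined at module level; one plausible setup (the article URL is just a placeholder, and the etv2 module path is assumed) is:

import eatiht
import eatiht.v2 as v2
import eatiht.etv2 as etv2

# Any content-heavy article page works here.
ARTICLE = 'http://www.washingtonpost.com/blogs/the-switch/wp/2014/12/26/elon-musk-the-new-tesla-roadster-can-travel-some-400-miles-on-a-single-charge/'

try_eatiht()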
Example No. 5
def scrapinsert():
    appbase_app = "3"
    appbase_doc_type = "article"
    appbase_app_username = "******"
    appbase_app_password = "******"

    ## Ping Appbase that it has been installed and scraping has started

    ## Run the scraping wget script here, or another file which calls wget and then this file
    # es = Elasticsearch()
    # url = "@localhost:9200"
    body_settings = '''{
        "analysis": {
            "filter": {
                "nGram_filter": {
                    "type": "nGram",
                    "min_gram": 2,
                    "max_gram": 20,
                    "token_chars": ["letter", "digit", "punctuation", "symbol"]
                }
            },
            "analyzer": {
                "nGram_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding", "nGram_filter"]
                },
                "body_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding", "stop", "snowball", "word_delimiter"]
                },
                "whitespace_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding"]
                }
            }
        }
    }'''

    body_mapping = '''{
        "article": {
            "properties": {
                "title": {
                    "type": "string",
                    "index_analyzer": "nGram_analyzer",
                    "search_analyzer": "whitespace_analyzer"
                },
                "link": {"type": "string", "index": "not_analyzed"},
                "body": {"type": "string", "analyzer": "body_analyzer"}
            }
        }
    }'''
    url = '@scalr.api.appbase.io'
    es = Elasticsearch('https://' + appbase_app_username + ':' + appbase_app_password + url)
    es.indices.close(index = appbase_app)
    es.indices.put_settings(index = appbase_app, body = body_settings)
    es.indices.open(index = appbase_app)

    print es.indices.put_mapping(index = appbase_app, body = body_mapping, doc_type = appbase_doc_type)

    # Make sure that I'm calling to the Appbase elastic search using HTTP Authentication
    # Creating the Mapping with the elastic search


    # Going through all the files in a directory and extracting title
    ## Instead of going through the current directory, go through everything
    for root, dirnames, filenames in os.walk('yourstory.com'):
        for file_name in filenames:
            file_path = os.path.join(root, file_name)
            # if os.path.isfile(file_name) and "html" in file_name :
            file = open(file_path, 'r')
            regex = re.compile('<title>(.*?)</title>', re.IGNORECASE|re.DOTALL)
            title = regex.search(file.read())
            if title:
                title = title.group(1)
                body = v2.extract("file://" + os.path.abspath(file_path))
                try:
                    ## Remove \n and similar characters from body
                    if body:
                        ## store link as the id and if error we check and then upsert
                        result = es.index(index= appbase_app, doc_type=appbase_doc_type, body={
                        'body': body,
                        'title':title,
                        'link': file_path
                        })
                    else:
                        print "Error at " + file_name
                except:
                    print file_name
                    print "Unable to add it to Elastic Search"
Example No. 6
    def fetch_wikipage(self, term):
        # TODO: use wiki_conn if available
        url = wikipedia + term.replace(' ', '_')

        try:
            text = eatiht.extract(url)

        # Page doesn't exist for this term
        except HTTPError:
            return ''

        text = wiki_footnote_re.sub(' ', text)
        return text
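The method above depends on a few module-level names; one plausible set of definitions, shown in Python 2 style to match the code (the base URL and footnote pattern are assumptions), is:

import re
from urllib2 import HTTPError   # Python 3: from urllib.error import HTTPError

import eatiht

# Base URL the search term is appended to.
wikipedia = 'https://en.wikipedia.org/wiki/'

# Strips bracketed footnote markers such as "[12]" from the extracted text.
wiki_footnote_re = re.compile(r'\[\d+\]')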
Example No. 8
    def extract_news( self, filename ):
        f = open(filename)

        content = v2.extract(f)

        ee = EntityExtract()
        names = ee.extract_name( content )

        doc = {
            "content": content,
            "entities": names
        }

        f.seek(0)

        meta_data = {
            "url" : "og:url",
            "sitename": "og:site_name",
            "published_time": "article:published_time"
        }

        soup = BeautifulSoup(f)

        doc['title'] = soup.title.string

        # Copy the Open Graph / article meta tags into the document when present.
        for k in meta_data.keys():
            try:
                value = soup.find(property=meta_data[k])['content']
                doc[k] = value
            except Exception:
                pass

        if 'published_time' in doc:
            doc['published_time'] = self._make_datetime(doc.pop('published_time'))


        return doc
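The metadata loop in extract_news works on any page that carries Open Graph meta tags; the same lookup in isolation, on an inline HTML snippet, looks like this (the sample markup is made up for illustration):

from bs4 import BeautifulSoup

html = ('<html><head><title>Sample</title>'
        '<meta property="og:site_name" content="Example News">'
        '<meta property="article:published_time" content="2015-01-01T00:00:00Z">'
        '</head><body></body></html>')

soup = BeautifulSoup(html, 'html.parser')

# soup.find(property=...) matches <meta property="..."> tags; ['content'] reads the attribute.
print(soup.find(property='og:site_name')['content'])
print(soup.find(property='article:published_time')['content'])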
Example No. 9
def getEatiht(url):
    try:
        # Strip newlines from the extracted text; return empty strings on any failure.
        text = v2.extract(url).replace('\n', '')
        return '', text
    except Exception, e:
        return '', ''
Example No. 10
def extrac_html(path):
    # Open the saved HTML file and hand the file object to eatiht v2.
    with open(path, "r") as file:
        return v2.extract(file)
Example No. 11
import sys
# Only needed when running against a local checkout rather than a pip-installed eatiht.
sys.path.append("eatiht/")
import eatiht.v2 as v2

url = 'http://www.washingtonpost.com/blogs/the-switch/wp/2014/12/26/elon-musk-the-new-tesla-roadster-can-travel-some-400-miles-on-a-single-charge/'

print (v2.extract(url))
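A common next step is saving the extracted text; a small sketch that works under both Python 2 and 3 (the output filename is arbitrary):

import io

text = v2.extract(url)

# Assuming extract returns unicode text, write it out explicitly as UTF-8.
with io.open('article.txt', 'w', encoding='utf-8') as out:
    out.write(text)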
Example No. 12
#feed = feedparser.parse('http://rss.cnn.com/rss/edition_technology.rss')
#feed = feedparser.parse('http://feeds.bbci.co.uk/news/technology/rss.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Sports.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Business.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/World.xml')
#feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Travel.xml')
feed = feedparser.parse('http://rss.nytimes.com/services/xml/rss/nyt/Arts.xml')
topic = feed['feed']['title']
for post in feed.entries:
    title = post.title
    link = post.link
    # Assumes the post title is safe to use as a filename (see the sketch below).
    text_file = open("db/" + title + ".txt", 'w+')
    record = {}
    date = post.published
    if "media_content" in post:
        image = post["media_content"][0]["url"]
        record["Image"] = image
    # Full article text via eatiht, then a summary and keywords derived from it.
    content = v2.extract(link)
    summary = summ.getSummary(title, content)
    keyword = rake.getKeyword(content)

    record["Title"] = title
    record["Link"] = link
    record["Date"] = date
    #record["Content"] = content
    record["Summary"] = summary
    record["Keywords"] = keyword
    record["Class"] = "Arts"
    text_file.write(json.dumps(record))
    text_file.close()
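Feed titles can contain characters that are not legal in filenames (slashes, colons and so on), so building the output path from the raw title is fragile; a small sanitizing helper, not part of the original script, is one way to harden it:

import re

def safe_filename(title):
    # Replace anything other than letters, digits, dot, dash and underscore.
    return re.sub(r'[^A-Za-z0-9._-]+', '_', title).strip('_')

# e.g. text_file = open("db/" + safe_filename(title) + ".txt", 'w+')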