Example No. 1
def collect(filename, query, limit, sentiment, partition):
    lines = set()

    # Collect the data from webhose.io with the given query up to the given limit
    response = webhose.search(query)

    while len(response.posts) > 0 and len(lines) < limit:
        # Go over the list of posts returned from the response
        for post in response.posts:
            # Verify that the length of the text is neither too short nor too long
            if 1000 > len(post.text) > 50:
                # Extract the text from the post object and clean it
                text = re.sub(r'(\([^\)]+\)|(stars|rating)\s*:\s*\S+)\s*$', '',
                              post.text.replace('\n', '').replace('\t', ''), 0,
                              re.I)
                # Add the post text to the lines we are going to save in the train/test file
                lines.add(text.encode('utf8'))
        time.sleep(2)
        print 'Getting %s' % response.next
        # Request the next 100 results from webhose.io
        response = response.get_next()

    # Build the train file (first part of the returned documents)
    with open(os.path.join(resources_dir, filename + '.train'),
              'a+') as train_file:
        for line in list(lines)[:int((len(lines)) * partition)]:
            train_file.write('%s\t%s\n' % (sentiment, line))

    # Build the test file (rest of the returned documents)
    with open(os.path.join(resources_dir, filename + '.test'),
              'a+') as test_file:
        for line in list(lines)[int((len(lines)) * partition):]:
            test_file.write('%s\t%s\n' % (sentiment, line))
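The function above assumes module-level setup that is not shown in the snippet; a minimal sketch of the presumed imports and configuration (the token source, the resources_dir path, and the sample call are placeholders, not the original values):

import os
import re
import time

import webhose

# Hypothetical values; the original module defines these elsewhere.
resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
webhose.config(token=os.environ['WEBHOSE_TOKEN'])

# Hypothetical call: label matching posts as 'positive' and write 80% of them
# to sentiment.train and the rest to sentiment.test.
# collect('sentiment', 'iphone review', 1000, 'positive', 0.8)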
    def search(self, query):
        """
        Retrieves results for the searched term from Webhose, using the
        webhose SDK to run the query.
        :param query: the search term or webhose Query object
        :return: the webhose search Response object
        """
        post = webhose.search(query)
        # print post
        return post
Example No. 4
def fetch_news_by_location(location):
    geocode = geocoder.google(location)
    query = {
        'location': location,
        'language': 'english',
        'site_type': 'news',
        'thread.country':geocode.country,
    }

    headlines = [x.title for x in webhose.search(query=query, since=int((datetime.now() - timedelta(days=3)).strftime("%s")))]
    return headlines
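A usage sketch, assuming the geocoder, webhose, and datetime imports are in place and webhose.config() has been called; the location string is only an illustration:

for headline in fetch_news_by_location('Toronto'):
    print(headline)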
Example No. 5
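# Note: this example relies on module-level objects that are not shown in the
# snippet: `d` (a feature dict keyed by AlchemyAPI labels), `emq` (apparently an
# emotion-score mapping), `clf` (a trained classifier used for the objectivity
# prediction), the call_alchemy_combined helper, and the Flask/model classes
# SearchForm, Search, SearchItem, Sentiment and Objectivity.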
def search():
    form = SearchForm(request.form)
    s = None
    if form.validate():
        q = form.s.data
        session['q'] = q
        res = webhose.search(q)
        search_res = []
        for i in range(0,3):
            url = BeautifulSoup(urllib2.urlopen(res.posts[i].url).read()).head.find('meta', attrs={'http-equiv':'refresh'}).get("content")[7::]
            comb = call_alchemy_combined.call(url) #alchemyapi.combined("url",url)
            rs1 = comb['docSentiment'] if 'docSentiment' in comb else [] #alchemyapi.sentiment("url", url)
            rt1 = comb['taxonomy'] if 'taxonomy' in comb else []
            rr1 = comb['relations'] if 'relations' in comb else [] #alchemyapi.relations("url", url)
            rc1 = comb['concepts'] if 'concepts' in comb else []
            rk1 = comb['keywords'] if 'keywords' in comb else []
            re1 = comb['entities'] if 'entities' in comb else []
            em1 = comb['docEmotion'] if 'docEmotion' in comb else [] #alchemyapi.emotion("url", url)
            #if(rs1['status'] == "OK"):
            #if(rr1['status']== "OK"):
            if(comb['status'] == "OK"):
                for t in rr1:
                    if t['subject']['text']+"-"+t['action']['lemmatized']+("-"+t['object']['text'] if 'object' in t else "") in d:
                        d[t['subject']['text']+"-"+t['action']['lemmatized']+("-"+t['object']['text'] if 'object' in t else "")] = 1
                if 'sentiment' in d:
                    d['sentiment'] = rs1['score']
                for t in rt1:
                    if t['label'] in d:
                        d[t['label']] = t['score']
                for c in rc1:
                    if 'dbpedia' in c:
                        if c['dbpedia'] in d:
                            d[c['dbpedia']] = c['relevance']
                for k in rk1:
                    if k['text'] in d:
                        d[k['text']] = k['relevance']
                for e in re1:
                    if e['text'] in d:
                        d[e['text']] = e['relevance']
                for e in em1:
                    if e in d:
                        d[e] = emq[e]
            o = clf.predict(d)[0]
            sent = Sentiment(score=rs1['score'],type=rs1['type']) if rs1 is not None and len(rs1)>0 else Sentiment(score="0",type="")
            search_res.append(SearchItem(url=url,objectivity=Objectivity(score=str(o)),sentiment=sent))
        s = Search(query=q,searchItems=search_res)
        s.save()
        session['s'] = s
    else:
        session['s'] = None
    return render_template("search/search.html", form=form, search=session['s'])
def extract_top_persons(top_person):
    top_person = top_person.title()
    print "extract_top_persons for " + top_person
    days_back = 30
    date_days_ago = datetime.now() - timedelta(days=days_back)

    top_person = top_person.lower()
    posts = []
    top_person = top_person.replace("\"", "")  # clean
    r = webhose.search("persons:\"" + top_person + "\" domain_rank:<10000", since=int(time.mktime(date_days_ago.timetuple())))

    for post in r.posts:
        posts.append(post)

    # If nothing came back, retry with a progressively shorter look-back window
    while days_back > 0 and len(r.posts) == 0:
        print "days_back = " + str(days_back)
        days_back -= 5
        # Recompute the window so the shorter look-back actually takes effect
        date_days_ago = datetime.now() - timedelta(days=days_back)

        r = webhose.search("persons:\"" + top_person + "\" domain_rank:<10000", since=int(time.mktime(date_days_ago.timetuple())))
        for post in r.posts:
            posts.append(post)

    return get_top_persons(posts, top_person)
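A minimal invocation sketch, assuming webhose.config() has already been run and that get_top_persons() (not shown here) aggregates the names co-mentioned with the query person:

top = extract_top_persons('Ada Lovelace')  # hypothetical person name
print top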
    def test_next(self):
        """
        check that if we use the 'since' parameter from one query, that we
        don't get any overlap
        """

        # run a "regular" query
        webhose.config(os.environ['WEBHOSE_TOKEN'])
        query = webhose.Query()
        query.some_terms = ('boston','red sox')
        query.language = 'english'
        query.site_type = 'news'

        response = webhose.search(query)

        # grab some stuff that we need for testing
        next_ts = response.next_ts
        last_post_crawled = response.posts[99].crawled_parsed

        # now run our second query
        response = webhose.search(query, since=response.next_ts)

        self.assertGreater(response.posts[99].crawled_parsed,
                           last_post_crawled)
def fullretrievew(terms, stage1):
    # Initiate Webhose search instance, return max results
    webhose.config(token='b88b78c1-0dac-4793-913e-7d20e0559144')
    response = webhose.search(terms)

    open(stage1, 'w').close()
    s1 = open(stage1, "a")
    i = 0
    for post in response:
        i = i + 1
        try:
            s1.write("-=-=-ARTICLE " + str(i) + ": " + str(post.thread.url.encode('utf-8')) + "\n")
        except:
            s1.write("-=-=-ARTICLE " + str(i) + "\n")
        try:
            s1.write("-=-=-TITLE: " + str(post.thread.title_full.encode('utf-8')) + "\n")
        except:
            s1.write("-=-=-TITLE: \n")
        s1.write("-=-=-SNIPPET: \n")
        try:
            s1.write("-=-=-TEXT: " + str(post.text.encode('utf-8')) + "\n")
        except:
            s1.write("-=-=-TEXT: \n")
    s1.close()
    return
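A usage sketch, assuming webhose is imported; the search terms and output file name are placeholders:

fullretrievew('"self driving cars"', 'stage1.txt')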
Example No. 10
!pip install webhose

import os
import webhose
webhose.config(token=os.environ["WEBHOSE_KEY"])

## Just make a call
#posts = webhose.search("Obama performance_score:>8")



q = webhose.Query()
#q.some_terms = ['"big data"','"machine learning"']
#q.title = '"big data" OR "machine learning"'
q.phrase = '"data science" OR "machine learning"'
print(q.query_string())

results = webhose.search(q.query_string() + ' performance_score:>1')

for post in results.posts:
  score = post.thread.performance_score
  if (score > 0):
    print(post.title + ' by ' + post.thread.site + ' with score ' + str(score))
    #print(post.thread.main_image)
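Only the first page of results is handled above; a short sketch of paging through further batches with the same results object, using the get_next() call that Example No. 1 relies on:

results = results.get_next()  # fetch the next batch of results
while len(results.posts) > 0:
    for post in results.posts:
        if post.thread.performance_score > 0:
            print(post.title + ' by ' + post.thread.site)
    results = results.get_next()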
Example No. 11
import webhose

webhose.config(token="ce676c6c-02c7-47f4-a4e3-6f332774a976")
for post in webhose.search("github"):
    print(post.title)
Example No. 12
import webhose
webhose.config(token="35699326-6aec-4b1e-8aa4-a0794ba56819")
r = webhose.search("python java")
for i in xrange(0, 20):
    print r.posts[i].title
#	print "\n"
Example No. 13
def news_search(keyword):
    webhose.config(token="97029546-2c7f-4116-a16d-e88dd66f09c2")
    r = webhose.search(keyword)
    for i in range(1):
        print(r.posts[i].title)
Example No. 15
def runQuery():
    queryResults = webhose.search(str(query))
    return queryResults
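# `query` is expected to be defined by the enclosing module or caller; its
# definition is not shown in this snippet.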
"""
response = unirest.get("https://webhose.io/search?token=f9cf7cbd-5c93-4672-8cb0-f6da249d1808&format=json&q=amazon%20OR%20AMZN&sort=relevancy&ts=1478463348741",
    headers={
    "Accept": "text/plain"
    }
)

print response.body
"""

import webhose
webhose.config(token='f9cf7cbd-5c93-4672-8cb0-f6da249d1808')

company_list = ["PayPal"]
news_content = {}
for org in company_list:
    r = webhose.search(org)
    news_content[org] = {}
    articleNo = 1
    # The response object is iterable over the returned posts
    for post in r:
        if post.language == 'english' and post.published[:4] == '2016':
            timestamp = post.published[:10] + post.published[11:19]
            news_content[org][articleNo] = {
                timestamp: [post.title, post.text]
            }
            articleNo += 1

            #filename = os.getcwd() + os.path.sep + ct._DATADICTIONARY + os.path.sep + '{0}.json'.format(org)
            #with open(filename,"a") as f:
            #    f.write(str(str(post.title) + str(timestamp) + str('*' * 25) + str(post.text)))
import time
from datetime import datetime, timedelta
import webhose

webhose.config(token='c6052904-f312-436b-a6d8-d915084ac866')  # backup token

days_back = 30
date_days_ago = datetime.now() - timedelta(days=days_back)
day = datetime.today().day

topic = 'facebook'
lang = 'english'
country = 'US'

# positive stories
rp = webhose.search("organization.positive:\"" + topic + "\" language:\"" +
                    lang + "\" thread.country:\"" + country + "\" format:\"" +
                    "json" + "\" domain_rank:<100000",
                    since=int(time.mktime(date_days_ago.timetuple())))

# calculate total number of articles by day
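# NOTE (assumption): the original snippet is truncated. `ra` appears to be a
# second webhose.search() response covering all stories on the topic (not only
# the positive ones in `rp`), and `aa` a list of per-day article counters;
# their definitions are not shown here. The day arithmetic below also assumes
# posts published in November/December of the same year.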
for post in ra:
    p = post.published
    m = int(p[5:7])
    if m == 11:
        d = int(p[8:10])
    else:
        d = int(p[8:10]) + 30
    aa[d - day] += 1
out0 = open('all.txt', 'w')
out0.truncate()
for i in aa:
    out0.write(str(i))
out0.close()