def collect(filename, query, limit, sentiment, partition):
    lines = set()

    # Collect the data from webhose.io with the given query, up to the given limit
    response = webhose.search(query)
    while len(response.posts) > 0 and len(lines) < limit:
        # Go over the list of posts returned in the response
        for post in response.posts:
            # Verify that the text is neither too short nor too long
            if 1000 > len(post.text) > 50:
                # Extract the text from the post object and clean it
                text = re.sub(r'(\([^\)]+\)|(stars|rating)\s*:\s*\S+)\s*$', '',
                              post.text.replace('\n', '').replace('\t', ''), 0, re.I)
                # Add the post text to the lines we are going to save in the train/test files
                lines.add(text.encode('utf8'))
        time.sleep(2)
        print 'Getting %s' % response.next
        # Request the next 100 results from webhose.io
        response = response.get_next()

    # Build the train file (first part of the returned documents)
    with open(os.path.join(resources_dir, filename + '.train'), 'a+') as train_file:
        for line in list(lines)[:int(len(lines) * partition)]:
            train_file.write('%s\t%s\n' % (sentiment, line))

    # Build the test file (rest of the returned documents)
    with open(os.path.join(resources_dir, filename + '.test'), 'a+') as test_file:
        for line in list(lines)[int(len(lines) * partition):]:
            test_file.write('%s\t%s\n' % (sentiment, line))
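# Hypothetical call to collect() above (the argument values are made up; assumes
# webhose.config() has already been called and resources_dir is a writable directory):
# roughly 80% of the collected posts land in 'reviews.train', the rest in 'reviews.test'.
collect('reviews', 'hotel reviews', 500, 'positive', 0.8)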
def search(self, query):
    """
    Retrieve results for the searched term from webhose.io.

    Uses the webhose SDK to construct the query object.
    :param query: the search term
    :return: the search result
    """
    post = webhose.search(query)
    # print post
    return post
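# Sketch of how the response returned by search() above is typically consumed
# (the token comes from the environment as in the notebook snippet below; the
# post attributes follow the other examples in this file):
import os
import webhose

webhose.config(token=os.environ["WEBHOSE_KEY"])
response = webhose.search("machine learning")
for post in response.posts[:5]:
    print(post.title)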
def fetch_news_by_location(location):
    geocode = geocoder.google(location)
    query = {
        'location': location,
        'language': 'english',
        'site_type': 'news',
        'thread.country': geocode.country,
    }
    headlines = [x.title for x in webhose.search(
        query=query,
        since=int((datetime.now() - timedelta(days=3)).strftime("%s")))]
    return headlines
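# Hypothetical call to fetch_news_by_location() above; assumes geocoder can
# resolve the place name and a webhose token has already been configured:
for headline in fetch_news_by_location("Boston, MA"):
    print(headline)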
def search():
    form = SearchForm(request.form)
    s = None
    if form.validate():
        q = form.s.data
        session['q'] = q
        res = webhose.search(q)
        search_res = []
        for i in range(0, 3):
            # Follow the meta-refresh redirect of the result page to get the real URL
            url = BeautifulSoup(urllib2.urlopen(res.posts[i].url).read()).head.find(
                'meta', attrs={'http-equiv': 'refresh'}).get("content")[7::]
            comb = call_alchemy_combined.call(url)  # alchemyapi.combined("url", url)
            rs1 = comb['docSentiment'] if 'docSentiment' in comb else []  # alchemyapi.sentiment("url", url)
            rt1 = comb['taxonomy'] if 'taxonomy' in comb else []
            rr1 = comb['relations'] if 'relations' in comb else []  # alchemyapi.relations("url", url)
            rc1 = comb['concepts'] if 'concepts' in comb else []
            rk1 = comb['keywords'] if 'keywords' in comb else []
            re1 = comb['entities'] if 'entities' in comb else []
            em1 = comb['docEmotion'] if 'docEmotion' in comb else []  # alchemyapi.emotion("url", url)
            # if(rs1['status'] == "OK"):
            # if(rr1['status'] == "OK"):
            if comb['status'] == "OK":
                # Fill the known features of d with the extracted relations, sentiment,
                # taxonomy, concepts, keywords, entities and emotions
                for t in rr1:
                    key = (t['subject']['text'] + "-" + t['action']['lemmatized'] +
                           ("-" + t['object']['text'] if 'object' in t else ""))
                    if key in d:
                        d[key] = 1
                if 'sentiment' in d:
                    d['sentiment'] = rs1['score']
                for t in rt1:
                    if t['label'] in d:
                        d[t['label']] = t['score']
                for c in rc1:
                    if 'dbpedia' in c:
                        if c['dbpedia'] in d:
                            d[c['dbpedia']] = c['relevance']
                for k in rk1:
                    if k['text'] in d:
                        d[k['text']] = k['relevance']
                for e in re1:
                    if e['text'] in d:
                        d[e['text']] = e['relevance']
                for e in em1:
                    if e in d:
                        d[e] = em1[e]
                o = clf.predict(d)[0]
                sent = (Sentiment(score=rs1['score'], type=rs1['type'])
                        if rs1 is not None and len(rs1) > 0
                        else Sentiment(score="0", type=""))
                search_res.append(SearchItem(url=url,
                                             objectivity=Objectivity(score=str(o)),
                                             sentiment=sent))
        s = Search(query=q, searchItems=search_res)
        s.save()
        session['s'] = s
    else:
        session['s'] = None
    return render_template("search/search.html", form=form, search=session['s'])
def extract_top_persons(top_person):
    top_person = top_person.title()
    print "extract_top_persons for " + top_person
    days_back = 30
    date_days_ago = datetime.now() - timedelta(days=days_back)
    top_person = top_person.lower()
    posts = []
    top_person = top_person.replace("\"", "")  # clean
    r = webhose.search("persons:\"" + top_person + "\" domain_rank:<10000",
                       since=int(time.mktime(date_days_ago.timetuple())))
    for post in r.posts:
        posts.append(post)
    while days_back > 0 or (len(r.posts) == 0):
        print "days_back = " + str(days_back)
        days_back -= 5
        # recompute the window so the shrinking days_back actually affects the query
        date_days_ago = datetime.now() - timedelta(days=days_back)
        r = webhose.search("persons:\"" + top_person + "\" domain_rank:<10000",
                           since=int(time.mktime(date_days_ago.timetuple())))
        for post in r.posts:
            posts.append(post)
    return get_top_persons(posts, top_person)
def test_next(self):
    """
    Check that if we use the 'since' parameter from one query,
    we don't get any overlap.
    """
    # run a "regular" query
    webhose.config(os.environ['WEBHOSE_TOKEN'])
    query = webhose.Query()
    query.some_terms = ('boston', 'red sox')
    query.language = 'english'
    query.site_type = 'news'
    response = webhose.search(query)

    # grab some stuff that we need for testing
    next_ts = response.next_ts
    last_post_crawled = response.posts[99].crawled_parsed

    # now run our second query
    response = webhose.search(query, since=response.next_ts)

    self.assertGreater(response.posts[99].crawled_parsed, last_post_crawled)
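# Sketch of paginating with the same 'since'/next_ts contract the test above
# relies on (token from the environment as in the test; stopping on the first
# empty page is an assumption):
import os
import webhose

webhose.config(os.environ['WEBHOSE_TOKEN'])
query = webhose.Query()
query.some_terms = ('boston', 'red sox')
response = webhose.search(query)
all_posts = list(response.posts)
while len(response.posts) > 0:
    # each page's next_ts is passed as 'since' so results never overlap
    response = webhose.search(query, since=response.next_ts)
    all_posts.extend(response.posts)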
def fullretrievew(terms, stage1):
    # Initiate Webhose search instance, return max results
    webhose.config(token='b88b78c1-0dac-4793-913e-7d20e0559144')
    re = webhose.search(terms)
    open(stage1, 'w').close()
    s1 = open(stage1, "a")
    i = 0
    for post in re:
        i = i + 1
        try:
            s1.write("-=-=-ARTICLE " + str(i) + ": " + str(post.thread.url.encode('utf-8')) + "\n")
        except:
            s1.write("-=-=-ARTICLE " + str(i) + "\n")
        try:
            s1.write("-=-=-TITLE: " + str(post.thread.title_full.encode('utf-8')) + "\n")
        except:
            s1.write("-=-=-TITLE: \n")
        s1.write("-=-=-SNIPPET: \n")
        try:
            s1.write("-=-=-TEXT: " + str(post.text.encode('utf-8')) + "\n")
        except:
            s1.write("-=-=-TEXT: \n")
    return
!pip install webhose

import os
import webhose

webhose.config(token=os.environ["WEBHOSE_KEY"])

## Just make a call
# posts = webhose.search("Obama performance_score:>8")

q = webhose.Query()
# q.some_terms = ['"big data"', '"machine learning"']
# q.title = '"big data" OR "machine learning"'
q.phrase = '"data science" OR "machine learning"'
print q.query_string()

results = webhose.search(q.query_string() + ' performance_score:>1')
for post in results.posts:
    score = post.thread.performance_score
    if score > 0:
        print(post.title + ' by ' + post.thread.site + ' with score ' + str(score))
        # print(post.thread.main_image)
import webhose webhose.config(token="ce676c6c-02c7-47f4-a4e3-6f332774a976") for post in webhose.search("github"): print(post.title)
import webhose webhose.config(token="35699326-6aec-4b1e-8aa4-a0794ba56819") r = webhose.search("python java") for i in xrange(0, 20): print r.posts[i].title # print "\n"
def news_search(keyword):
    webhose.config(token="97029546-2c7f-4116-a16d-e88dd66f09c2")
    r = webhose.search(keyword)
    for i in range(1):
        print(r.posts[i].title)
def runQuery():
    queryResults = webhose.search(str(query))
    return queryResults
"""
response = unirest.get("https://webhose.io/search?token=f9cf7cbd-5c93-4672-8cb0-f6da249d1808&format=json&q=amazon%20OR%20AMZN&sort=relevancy&ts=1478463348741",
                       headers={"Accept": "text/plain"})
print response.body
"""
import webhose

webhose.config(token='f9cf7cbd-5c93-4672-8cb0-f6da249d1808')

company_list = ["PayPal"]
news_content = {}
for org in company_list:
    r = webhose.search(org)
    news_content[org] = {}
    articleNo = 1
    # iterate once over the posts returned for this company
    for post in r:
        news_content[org][articleNo] = {}
        if post.language == 'english' and post.published[:4] == '2016':
            timestamp = post.published[:10] + post.published[11:19]
            news_content[org][articleNo][timestamp] = [post.title, post.text]
        articleNo += 1
        # filename = os.getcwd() + os.path.sep + ct._DATADICTIONARY + os.path.sep + '{0}.json'.format(org)
        # with open(filename, "a") as f:
        #     f.write(str(str(post.title) + str(timestamp) + str('*' * 25) + str(post.text)))
import time
from datetime import datetime, timedelta

import webhose

webhose.config(token='c6052904-f312-436b-a6d8-d915084ac866')  # backup token

days_back = 30
date_days_ago = datetime.now() - timedelta(days=days_back)
day = datetime.today().day
topic = 'facebook'
lang = 'english'
country = 'US'

# positive stories
rp = webhose.search("organization.positive:\"" + topic + "\" language:\"" + lang +
                    "\" thread.country:\"" + country + "\" format:\"" + "json" +
                    "\" domain_rank:<100000",
                    since=int(time.mktime(date_days_ago.timetuple())))

# calculate total number of articles by day
# (note: 'ra', the search over all stories, and the per-day counter list 'aa'
#  are presumably defined in the omitted part of this script)
for post in ra:
    p = post.published
    m = int(p[5:7])
    if m == 11:
        d = int(p[8:10])
    else:
        d = int(p[8:10]) + 30
    aa[d - day] += 1

out0 = open('all.txt', 'w')
out0.truncate()
for i in aa:
    out0.write(str(i))