def get_webhose_news(ticker):
    webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85")
    query_params = {
        "q": """language:english thread.country:US published:>1483232461 (site_category:stocks) (WFC OR wells fargo) (site_type:blogs OR site_type:discussions) (thread.title:'WFC' OR thread.title:'wells fargo')""",
        "ts": "1533754757303",
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)

    # Build the list of names to match before it is used below.
    # company_name is assumed to be defined in the enclosing scope.
    if len(company_name) >= 2:
        var1 = company_name[0].lower()
        var2 = company_name[1].lower()
        tick = ticker.lower()
        names = [var1, var2, tick]
    else:
        var1 = company_name[0].lower()
        tick = ticker.lower()
        names = [var1, tick]

    # Pair each matching sentence with the post's publication date so the
    # final comprehension can unpack (date, text) pairs.
    post = output['posts'][20]
    lst = [
        [post['published'], x]
        for x in post['text'].split('. ')
        for var in names
        if var in x.lower()
    ]
    barrons_news = [[date, text] for date, text in lst for var in names if var in text]
    return barrons_news
def get_pages_into_json(domain, n=1):
    num_pages = n
    webhoseio.config(token="a64af0cc-bb64-44dd-a56d-d1d1e06b287e")
    query_params = {
        "q": "language:english",
        "ts": "1512637551646",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)

    # file_path is assumed to be defined at module level.
    newpath = file_path + '/{}'.format('20171214')
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    with open(newpath + '/data_1.json', 'w') as outfile:
        json.dump(output, outfile)

    # Fetch and save the remaining pages.
    for p in range(2, num_pages + 1):
        output = webhoseio.get_next()
        with open(newpath + '/data_{}.json'.format(p), 'w') as outfile:
            json.dump(output, outfile)
def index(request):
    # Create your views here.
    webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
    time_now = datetime.datetime.now()
    time_30_days_before = time_now - datetime.timedelta(days=30)
    ts_30_days_before = time_30_days_before.timestamp()
    query_params = {
        "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel')",
        "ts": ts_30_days_before,
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)
    context = {'output': output}
    return render(request, 'news/index.html', context)

# def index(request):
#     # Create your views here.
#     webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
#     query_params = {
#         "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel' OR 'AIMS AMP Capital' OR 'Sabana REIT')",
#         "ts": "1516537944411",
#         "sort": "crawled"
#     }
#     output = webhoseio.query("filterWebContent", query_params)
#     return JsonResponse(output)
def config_webhoseio(self, token):
    # jaju
    # webhoseio.config(token="ecd3d983-093a-4d8d-a7bd-71207dad85a9")
    # divya
    # webhoseio.config(token="e6c1084e-8b63-42cf-bfe3-8ccd24a3a9b1")
    webhoseio.config(token=token)
def retrieve_topmost_article_new(self, stock_name, num_sentences):
    """
    Retrieves the topmost article about the stock, but solely through the usage of the webhose API.
    This does not involve checking the database for an already existing article for the stock.
    :param stock_name: stock name.
    :param num_sentences: number of sentences to return for summary.
    :return: a StockArticle object.
    """
    webhoseio.config(token=webhose_api_key)
    filters = {
        'language': 'english',
        'text': stock_name,
        'site_type': 'news',
        'site_category': 'finance',
        'thread.title': stock_name
    }
    query_result = webhoseio.query('filterWebContent', filters)
    stock_posts = query_result['posts']
    if len(stock_posts) == 0:
        return None
    article_url = stock_posts[0].get('url')
    article_text = stock_posts[0].get('text')
    article_summary = self.summarize_article(article_text, num_sentences)
    return StockArticle(stock_name, article_url, article_summary)
def __init__(self):
    webhoseio.config(token='')
    self.params = {'q': 'site.type:news', 'from': 0}
    results = webhoseio.query('nseFilter', self.params)
    self.total_results = results['totalResults']
    self.page = 0
    self.batches = max(self.total_results // 10, 10)
    self.news_batch = results['docs']
def getWebHoseData(location):
    webhoseio.config(token="b99dbdf5-caac-4a2c-886a-fb8f37f365a0")
    query_params = {
        "q": "performance_score:>7 location:" + location,
        "ts": "1506110156153",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    totalWebHose = len(output['posts'])
    return totalWebHose
def scraper():
    current_time = datetime.datetime.utcnow()
    time_diff = datetime.timedelta(hours=-1 * SCRAPE_TIME_DELTA)
    start_time = current_time + time_diff
    start_time = start_time.timestamp()
    start_time_str = str(round(start_time))
    start_time_str = start_time_str.ljust(13, '0')
    webhoseio.config(token=WEBHOSE_KEY)
    for category in keywords.keys():
        query = query_builder(category, start_time_str)
        scrape(query, category, start_time_str, time_diff)
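# Hypothetical sketch of the query_builder helper called above; it is not
# defined in this snippet, so both this helper and the shape of `keywords`
# (assumed to map a category name to a list of search terms) are illustrative
# assumptions rather than the original implementation.
def query_builder(category, start_time_str):
    terms = " OR ".join(keywords[category])  # e.g. "bitcoin OR ethereum"
    return {
        "q": "({}) language:english site_type:news".format(terms),
        "ts": start_time_str,
        "sort": "crawled",
    }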
def undependant():
    query = input('what are you searching for ?')
    # Convert the requested result count to an int; iterating over the raw
    # input string would loop over its characters instead.
    threads = int(input('how many results you want ?'))
    with open('tango_django/search.key', 'r') as f:
        key = f.readline().strip()
    try:
        webhoseio.config(token=key)
        results = webhoseio.query("filterWebContent", {"q": query})
        for i in range(threads):
            print(results['posts'][i]['text'])
        # for post in results['posts'][:10]['text']:
        #     count += 1
        #     print(f'result number {count} \n {post}')
    except KeyError as err:
        print(f'ooopsie :{err}')
def webhoseio_search(query):
    key = read_webhoseio_key()
    results = []
    webhoseio.config(token=key)
    query_params = {'q': query + ' language:english', 'sort': 'relevancy'}
    output = webhoseio.query('filterWebContent', query_params)
    for result in output['posts']:
        results.append({
            'name': result['title'],
            'url': result['url'],
            'summary': result['published']
        })
    return results[:10]
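# Minimal usage sketch for webhoseio_search, assuming read_webhoseio_key()
# returns a valid token; the query string is an arbitrary example, not taken
# from the source.
if __name__ == '__main__':
    for hit in webhoseio_search('machine learning'):
        print(hit['name'], hit['url'])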
def get_webhose_news(ticker):
    webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85")
    names = get_names(ticker)
    query_params = {
        "q": f"""language:english thread.country:US published:>1483232461 (site_category:stocks) ({ticker} OR {names[0]} OR {names[-1]}) (site_type:blogs OR site_type:discussions) (thread.title:'{ticker}' OR thread.title:'{names[0]}' OR thread.title:'{names[-1]}')""",
        "ts": "1533754757303",
        "sort": "published"
    }
    output = [x for x in webhoseio.query("filterWebContent", query_params)['posts']]
    lst = [[y['published'], y['text'].replace('\n', ' ').lower().split('. ')] for y in output]
    webhose_new = [
        [datetime.strptime(date.split('T')[0], '%Y-%m-%d').date(),
         re.sub('// no comments|posted by', '', text)]
        for date, y in lst
        for text in y
        if len(text) < 200
        for var in names
        if var in text
    ]
    return webhose_new
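# Hypothetical sketch of the get_names helper used above; it is not defined in
# this snippet. The TICKER_TO_NAME lookup table is an assumption for
# illustration, not part of the original code.
def get_names(ticker):
    company_name = TICKER_TO_NAME[ticker]  # e.g. "Wells Fargo" (assumed mapping)
    return [word.lower() for word in company_name.split()] + [ticker.lower()]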
def extract_data_from_webhose():
    webhoseio.config(token="7ad89131-980e-48c3-b588-e68adb7c1be0")
    s = int(time.time()) - 500
    query_params = {
        "q": "language:english site:amazon.com site_category:shopping spam_score:>0.7",
        "ts": "{}".format(s),
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    logging.info(output)

    key = []
    reviewname = []
    productname = []
    reviewdate = []
    rating = []
    label = []
    sourcetype = []
    runtype = []
    spam_score = []
    text = []

    for i in range(0, 1):
        logging.info(i)
        logging.info(output)
        key.append(i)
        reviewname.append(output['posts'][i]['author'])
        productname.append(output['posts'][i]['thread']['title'])
        reviewdate.append(output['posts'][i]['thread']['published'])
        rating.append(output['posts'][i]['thread']['rating'])
        tt = output['posts'][i]['text']
        text.append(tt)
        ss = output['posts'][i]['thread']['spam_score']
        spam_score.append(ss)

    df = pd.DataFrame()
    df['key'] = key
    df['reviewname'] = reviewname
    df['productname'] = productname
    df['reviewdate'] = reviewdate
    df['rating'] = rating
    df['label'] = 'fake'
    df['sourcetype'] = 'amazon'
    df['runtype'] = 'near_real_time'
    df['text'] = text
    df['snapshot_time'] = s
    df.to_gbq('webhoseDB.staging_table', 'wehosestream', if_exists='append', verbose=False)
def webhose_func():
    YOUR_API_KEY = "a161f6e5-ab51-40a1-afaf-ba13e67baefa"
    webhoseio.config(token=YOUR_API_KEY)
    print("\n")
    print("WELCOME TO WEBHOSE\n")
    search = input(
        "Input the string that you want to search for! It can be something like ipod OR ipad\n"
        "Type in a list of strings like cow,chicken,pig to plot sentiment for those words against the stock price.\n"
        "3 TERMS ARE ENOUGH!\n"
    )
    search_terms = search.split(",")
    search_df_arr = []
    for search in search_terms:
        search += " language:english"
        sort = input(
            "\nType crawled, relevancy, rating or published for your sorting option\n"
        )
        timestamp = 1541348859918
        size = input(
            "\nWhat is the number of posts returned per request? 1 is the smallest and 100 is the biggest!\n"
        )
        query_params = {
            "accuracy_confidence": "high",
            "q": search,
            "sort": sort,
            "ts": timestamp,
            "size": size,
        }
        output = webhoseio.query("filterWebContent", query_params)
        number_of_posts = len(output['posts'])
        dates = []
        for a in range(number_of_posts):
            dates.append(output['posts'][a]['published'])
        df = pd.DataFrame(index=dates, columns=["Title"])
        for i in range(number_of_posts):
            df.iloc[i] = [output['posts'][i]['title']]
        search_df_arr.append(df)
    # Append the raw search terms after the per-term DataFrames so the caller
    # can tell which term produced which DataFrame.
    search_df_arr = search_df_arr + search_terms
    return search_df_arr
def main():
    # needs to be substituted by real webhoseio token
    webhoseio.config(token="XXXXXXXXXXXXXXXXX")
    query_params = {
        "q": "language:english has_video:false is_first:true site_type:news site:(cnn.com OR bbc.com OR reuters.com OR nbcnews.com OR foxnews.com OR washingtonpost.com OR espn.com OR tmz.com OR sportingnews.com OR usnews.com OR wsj.com OR latimes.com OR time.com OR nydailynews.com OR economist.com OR technewsworld.com OR computerworld.com OR newsmax.com OR theatlantic.com OR hollywoodreporter.com) spam_score:0.0",
        "ts": "1510212713819",
        "sort": "crawled"
    }

    # Get the first set of articles.
    output = webhoseio.query("filterWebContent", query_params)
    fl_counter = 1
    while fl_counter <= 1000:
        fl_name = "file" + "_" + str(fl_counter)
        # Specify the path to the corpus folder here.
        opfile = open('C:/Users/Heena/News3/' + fl_name, 'w', encoding='utf-8')
        for post in output['posts']:
            uuid = post['uuid']
            url = post['url']
            site_full = post['thread']['site_full']
            site_categories = post['thread']['site_categories']
            title_full = post['thread']['title_full']
            title = post['title']
            published = post['published']
            author = post['author']
            text = post['text']
            doc = document(uuid, url, site_full, site_categories, title, title_full,
                           published, author, text)
            jsondata = json.dumps(doc.__dict__, sort_keys=True)
            opfile.write(jsondata + '\n')
        opfile.close()
        time.sleep(30)
        print("fl_counter = ", fl_counter)
        output = webhoseio.get_next()
        print("next = ", output['next'])
        fl_counter += 1
def api_df(token, site_lists, time_delta, filename):
    """
    A pipeline from Webhose API to CSV.
    :param token: api token for Webhose API.
    :param site_lists: list of sites we need to crawl.
    :param time_delta: time window. Ex: -3 means the most recent 3 days. Can only be from -1 to -30.
    :param filename: filename of CSV.
    :return: a pandas DataFrame of the crawled posts.
    """
    webhoseio.config(token=token)
    query_params = get_query(site_lists, time_delta)
    output_init = webhoseio.query("filterWebContent", query_params)
    output_flat = pd.io.json.json_normalize(output_init['posts'])
    df = output_flat[[
        'thread.uuid', 'author', 'external_links', 'published', 'text',
        'thread.site_full', 'thread.site_categories', 'thread.site_section',
        'thread.section_title', 'thread.main_image',
        'thread.social.facebook.comments', 'thread.social.facebook.likes',
        'thread.social.facebook.shares', 'title', 'url'
    ]]
    output = webhoseio.get_next()
    while len(output['posts']) > 0:
        df = output_to_df(output, df)
        try:
            output = webhoseio.get_next()
        except HTTPError:
            return df
        # df.to_csv(filename, index=False)
        if len(df) % 1000 == 0:
            print(str(len(df)) + ' has finished')
    return df
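# Minimal usage sketch for api_df, assuming a valid Webhose token and that the
# get_query/output_to_df helpers referenced above are defined elsewhere; the
# token, site list, and filename below are placeholders, not values from the source.
if __name__ == '__main__':
    posts_df = api_df(
        token='YOUR_WEBHOSE_TOKEN',
        site_lists=['reuters.com', 'bbc.com'],
        time_delta=-3,
        filename='webhose_posts.csv',
    )
    posts_df.to_csv('webhose_posts.csv', index=False)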
def related_news(keywords):
    """
    search for related news by keywords
    use Webhose.io API
    """
    if len(keywords) >= 4:
        keywords = keywords[0:3]
    keyword_str = " ".join(keywords)

    # API key
    webhoseio.config(token="0e3f95f5-2fc7-494f-881e-e29915cc3e9a")
    query_params = {
        "q": keyword_str + " language:english site_type:news",
        "ts": "1528948373304",
        "sort": "relevancy"
    }
    resp = webhoseio.query("filterWebContent", query_params)
    posts = resp['posts']
    if len(posts) < 2:
        return None, None, True

    MAX_ARTICLES = 5  # take first 5
    related_articles = []
    related_urls = []
    for i in range(min(MAX_ARTICLES, len(posts))):
        post = posts[i]['thread']
        related_url = {'url': post['url'], 'title': post['title']}
        related_urls.append(related_url)
        related_articles.append(post['site_full'])  # currently redirected link
    return related_articles, related_urls, False
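# Minimal usage sketch for related_news; the keywords are arbitrary examples,
# not taken from the source.
articles, urls, not_enough = related_news(['climate', 'policy', 'emissions'])
if not not_enough:
    for item in urls:
        print(item['title'], item['url'])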
import os

from flask import jsonify
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import numpy as np
import pandas as pd
import webhoseio

WEBHOSE_TOK = os.getenv('WEBHOSEIO_TOKEN')
if WEBHOSE_TOK is None:
    print('ERROR: WEBHOSEIO_TOKEN environment variable not set!')
webhoseio.config(token=WEBHOSE_TOK)


def process_annotations(annotations):
    content = []
    salience = []
    sentiment = []
    magnitude = []
    for entity in annotations.entities:
        content.append(entity.name)
        salience.append(entity.salience)
        sentiment.append(entity.sentiment.score)
        magnitude.append(entity.sentiment.magnitude)
    df = pd.DataFrame({
        "content": content,
        "salience": salience,
import webhoseio
import pymongo
import time
from datetime import date
from clientDF import DiffbotClient

action = input("Do you want to make a new request?(Y/N): ")

# List with urls for the diffbot API
urlList = []

if action == 'Y':
    # configuring webhose request
    webhoseio.config(token="4057ff96-3ff1-4982-8c99-41d708f980ef")
    # query = "politics language:english thread.country:GB performance_score:>5"
    query = "Climate Change"
    query_params = {
        "q": "Climate Change",
        "ts": "1518278227788",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)

    # getting the urls of the websites that matched our query/params
    # saving the urls to a file for verification
    outputFilename = input("Enter the name of the file which will contain the webhose urls: ")
    with open(outputFilename, 'w') as urlsOut:
        urlsOut.write("Query used: " + query + "\n\n")
        j = 0
        while output['posts']:
from django.shortcuts import render, redirect, HttpResponse
from django.urls import reverse
from django.contrib import messages
from django.http import JsonResponse
import random
import string

# Create your views here.
from .models import Cart, Order
from ..LoginAndRegister.models import User
import webhoseio

webhoseio.config(token='ba97ec58-a132-4dad-aa67-28e170c1c0d6')


def dashboard(request):
    # Check if there is a user_id in the session, i.e. whether a user is logged in.
    if 'user_id' not in request.session:
        return redirect(reverse("userLG:login"))
    # Remember some information about the search history.
    if 'productInfo' not in request.session:
        request.session['productInfo'] = {
            'categories': 'shirt',
            'price': '60',
            'brand': 'nike'
        }
    # This information was used to let the user click on a suggested product and
    # see the correct details for that specific product. We delete it here
    # because it is not needed on this page.
    if "product" in request.session:
        del request.session["product"]
import webhoseio
from collections import Counter

webhoseio.config(token="7f568fb7-7400-47c9-9213-7b074c590dc0")
query_params = {
    "q": "EternalBlue OR WannaCrypt OR ShadowBroker OR EquationAPT OR Lazarus OR LazarusGroup OR Lazarus-Group OR Erebus OR Ransomware OR GoldenEye OR CryptoLocker OR Locky OR Petya OR zCrypt OR PowerWare OR HydraCrypt OR Cerber OR CryptoWall OR CVE",
    "ts": "1479589849479"
}
output = webhoseio.query("productFilter", query_params)
print(output)
print(output["darkposts"][0]["text"])   # Print the text of the first post
print(output["darkposts"][0]["title"])  # Print the title of the first post

fname = "site_darkweb.txt"

# Get the next batch of posts
output = webhoseio.get_next()

# Tally how often each post's source site appears in the batch. Counter.update
# is given a one-element list so whole site names are counted rather than
# individual characters.
site_freq = Counter()
for post in output['darkposts']:
    site_freq.update([post['source']['site']])

# Write the per-site counts to the file (opened for writing, not reading).
with open(fname, "w") as f:
    for k, v in site_freq.most_common():
        f.write("{},{}\n".format(k, v))
import webhoseio
import json
import time

webhoseio.config(token="e0e8b2b9-80ce-4cdc-bde9-b730ada6c3f9")

f1 = open('cities_code_latlng.csv')
locs = []
i = 0
for l in f1:
    # print(l)
    if i == 0:
        i = 1
        continue
    ls = l.strip().split(',')
    # print(ls)
    # print(ls[0])
    # print(ls[2])
    locs.append((ls[1], ls[2], ls[3], ls[4]))
# print(locs[0])


def get_data(place, country, lat, lng):
    print(place, country)
    query_params = {
        "q": "language:english site_type:news thread.country:{} location:{}".format(
            country, place),
        "sort": "relevancy",
import webhoseio webhoseio.config(token="50e8f8f7-33f3-4cfe-8b39-d5ae04c44b6b") query_params = { "q": "(\"West Texas Detention Facility\" OR \"Southwest Key Casa Padre\" OR \"West Texas Detention Facility\" OR \"Southwest Key Casa Padre\" OR \"Southwest Key San Benito/Casa Antigua\" OR \"Southwest Key Combes\" OR \"Southwest Key El Cajon\" OR \"Southwest Key Estrella del Norte\" OR \"Southwest Key Youngtown - Hacienda del Sol\" OR \"Southwest Key Glendale\" OR \"Southwest Key Conroe\" OR \"Joe Corley Detention Facility \" OR \"Southwest Key Casa Quetzal\" OR \"Southwest Key Casa Blanca\" OR \"Southwest Key Casa Montezuma\" OR \"Southwest Key Casa El Presidente\" OR \"Southwest Key Casa Grande\" OR \"Southwest Key Casa Franklin\" OR \"Southwest Key Nueva Esperanza\" OR \"Southwest Key Casa Houston Reliant\" OR \"Southwest Key Shelter Care\" OR \"Southwest Key Casita del Valle\" OR \"Southwest Key Pleasant Hill\" OR \"Southwest Key Las Palmas\" OR \"Southwest Key Lemon Grove\" OR \"Southwest Key La Esperanza Home for Boys\" OR \"Southwest Key Campbell\" OR \"Shiloh Treatment Center\" OR \"Shenendoah Valley Juvenile Center\" OR \"La Salle County Regional Detention Center\" OR \"Alice Peterson Residence \" OR \"Dorothy Mitchell Residence \" OR \"Msgr. Bryan Walsh's Children's Village \" OR \"Board of Childcare\" OR \"Brazoria County Youth Homes\" OR \"Galveston Multicultural Institute\" OR \"The Pelican Island Center\" OR \"Bethany Children's Home\" OR \"Yolo County Juvenile Detention Center\" OR \"Children's Home of Kingston\" OR \"Leake and Watts Passages of Hope\" OR \"Griffin Home Residential Programs - Maceachern House\" OR \"His House Children's Home\" OR \"Homestead Temporary Shelter for Unaccompanied Children\" OR \"Holy Family Institute\" OR \"Inwood House/The Children's Village\" OR \"McAllen Station\" OR \"Lincoln Hall\" OR \"Bokenkamp Children's Shelter\" OR \"Morrison Paso Staff Secure\" OR \"Morrison Knott Street\" OR \"Nuevo Amanecer Latino Children's Services\" OR \"Neighbor to Family\" ) site_type:news thread.country:US language:english", "ts": "1527911096744", "sort": "crawled" } output = webhoseio.query("filterWebContent", query_params) print output['posts'][0]['text'] # Print the text of the first post print output['posts'][0]['published'] # Print the text of the first post publication date # Get the next batch of posts output = webhoseio.get_next() # Print the site of the first post print output['posts'][0]['thread']['site']
import requests
import pandas as pd
from bs4 import BeautifulSoup as soup
import webhoseio  # news API
from pinance import Pinance  # Stock API

webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXXXXXXXXX")

keywords = [
    '$', 'clinical', 'trial', 'phase', 'investment', 'position', 'price',
    'buy', 'sell'
]

# my_url = 'http://topforeignstocks.com/stock-lists/the-complete-list-of-biotech-stocks-trading-on-nasdaq/'  # List of biotech companies (outdated source)
# uClient = requests.get(my_url)  # downloads webpage
# page_html = uClient.text
# page_soup = soup(page_html, "html.parser")
# bio_tech_companies = page_soup.findAll("td", {"class": "column-2"})  # specific to name
# bio_tech_companies_symbol = page_soup.findAll("td", {"class": "column-3"})  # specific to symbol

NASDAQ_File = pd.read_csv('NASDAQ_Company_List.csv')
Industry = NASDAQ_File['Industry']
Symbol = NASDAQ_File['Symbol']
Name = NASDAQ_File['Name']

for i in range(len(Industry)):
    if 'Biotechnology' in str(Industry[i]) or 'Pharmaceuticals' in str(Industry[i]):
        query = str(Name[i])           # gets name of biotech company
        query_symbol = str(Symbol[i])  # gets symbol of biotech company
        stock = Pinance(query_symbol)
        stock.get_quotes()
        try:
            # necessary if stock.get_quotes returns an empty or incomplete dictionary
            if stock.quotes_data['tradeable'] == True:
                stock_price = stock.quotes_data['regularMarketPrice']
                if stock_price < 35.00:
import webhoseio
import logging
import config

logger = logging.getLogger("uniLogger")

webhoseio.config(token=config.WEB_HOSE_TOKEN)
query_params = {
    "q": "title:\"big data\" -text:\"big data\" language:english",
    "sort": "crawled"
}

output = webhoseio.query("filterWebContent", query_params)
print(output['posts'][0]['text'])  # Print the text of the first post
# Print the text of the first post publication date
print(output['posts'][0]['published'])

# Get the next batch of posts
output = webhoseio.get_next()
if len(output['posts']) == 0:
    exit(0)
# Print the site of the first post
print(output['posts'][0]['thread']['site'])


# doesn't really get the kinds of articles I'm looking for.
def scrape_japanese_cuisine():
    query_params = {
        "q": "site_category:japanese_cuisine language:english site_type:blogs",
        "sort": "crawled"
import json
import webhoseio

webhoseio.config(token="160a0e9e-b47e-4229-bf40-d7f1e2d5bf35")
query_params = {
    "q": "site:nytimes.com language:english site_type:news spam_score:0",
    "ts": "1504680060834",
    "sort": "crawled"
}
output = webhoseio.query("filterWebContent", query_params)

e = []
count = 0
for i in range(50):
    print(count)
    print(len(output['posts']))
    for x in output['posts']:
        if x['title']:
            count += 1
            e.append({'title': x['title'], 'text': x['text'], 'label': 'real'})
    # time.sleep(300)
    output = webhoseio.get_next()

with open('web75.json', 'w') as outfile:
    json.dump(e, outfile)
print("success")
def config_webhoseio(self):
    webhoseio.config(token="ecd3d983-093a-4d8d-a7bd-71207dad85a9")
import webhoseio webhoseio.config(token="8d786df0-7885-4818-a4d2-0562b9507f1f") query_params = { "q": "Urovant Sciences language:english site_type:news thread.published:>1535778000000", "sort": "crawled" } output = webhoseio.query("filterWebContent", query_params) print(len(output)) print(output['posts'][0]['text']) # Print the text of the first post print(output['posts'][0] ['published']) # Print the text of the first post publication date print(output) # Get the next batch of posts output = webhoseio.get_next() print(output) # Print the site of the first post print(output['posts'][0])
#! /usr/bin/env python3
# from tinydb import TinyDB, Query
import json
import webhoseio

webhoseio.config(token='11a5bf53-12f6-440d-a84f-e42c18c7c38d')
output = webhoseio.query("filterWebContent", {
    "q": "Global Warming",
    "sort": "relevancy"
})

print("URL: " + output['posts'][0]['url'])              # Print the URL of the first post
print("title: " + output['posts'][0]['title'])          # Print the title of the first post
print("published: " + output['posts'][0]['published'])  # Print the first post's publication date

# Get the next batch of posts and print the same fields.
output = webhoseio.get_next()
print("URL: " + output['posts'][0]['url'])
print("title: " + output['posts'][0]['title'])
print("published: " + output['posts'][0]['published'])

# try:
#     response = urlopen(request)
#     data = response.read()
#     parsed_json = json.loads(data)
# except URLError, e:
#     print 'API call not working. Got an error code:', e
# else:
def update(self):
    crawledFrom = self.last_updated.timestamp()
    if abs(self.last_updated - self.last_modified) < timedelta(seconds=1):
        crawledFrom = (timezone.now() - timedelta(days=3)).timestamp()
    crawledFrom = int(crawledFrom * 1000)

    webhoseio.config(token='e187b1d6-59c5-4b3b-9614-1c42b3e3658e')
    output = webhoseio.query(
        "filterWebContent", {
            "q": self.query,
            "ts": crawledFrom,
            "language": "english",
            "site_type": "news",
        })
    output = output['posts']
    while True:
        temp = webhoseio.get_next()
        output += temp['posts']
        if temp['moreResultsAvailable'] <= 0:
            break

    previous_posts_uuid = []
    previous_posts_title = []
    if len(output) > 0:
        previous_posts_uuid = [post.uuid for post in Post.objects.all()]
        previous_posts_title = [post.title.lower() for post in Post.objects.all()]

    for post in output:
        if post['thread']['uuid'] in previous_posts_uuid:
            old_post = Post.objects.get(uuid=post['thread']['uuid'])
            if self not in old_post.trackers.all():
                old_post.trackers.add(self)
        elif post['thread']['title'].lower() in previous_posts_title:
            old_post = Post.objects.get(title__iexact=post['thread']['title'])
            if self not in old_post.trackers.all():
                old_post.trackers.add(self)
        else:
            try:
                new_post = Post(
                    uuid=post['thread']['uuid'],
                    url=post['thread']['url'],
                    site_full=post['thread']['site_full'],
                    site_categories=post['thread']['site_categories'],
                    title=post['thread']['title'][:1024],
                    published=post['thread']['published'],
                    site_type=post['thread']['site_type'],
                    country=post['thread']['country'],
                    main_image=post['thread']['main_image'],
                    performance_score=post['thread']['performance_score'],
                    domain_rank=post['thread']['domain_rank'],
                    author=post['author'],
                    text=post['text'],
                    language=post['language'],
                    entities=post['entities'],
                    social=post['thread']['social'],
                )
                new_post.save()
                new_post.trackers.add(self)
                previous_posts_uuid.append(post['thread']['uuid'])
                previous_posts_title.append(post['thread']['title'].lower())
            except DataError as err:
                print("Error: %s" % (err))
                print(post)

    self.last_updated = timezone.now()
    self.save()
    return True
# Enter Webhose_key before proceeding!
# Takes text data from CNN and Fox News and compares the output of each using
# Natural Language Processing word frequency statistics

import webhoseio
import pandas as pd
import scattertext as st
import spacy.en
from scattertext import CorpusFromPandas, produce_scattertext_explorer, word_similarity_explorer
from IPython.display import IFrame

webhose_key = ""  # Enter Webhose Key Here
webhoseio.config(token=webhose_key)


def get_headlines(search_term, site):
    query_params = {
        "q": search_term + " site:" + site + ".com language:english",
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)
    print('[-] creating ' + site + '_output.txt')
    file = open(site + '_output.txt', 'w')
    try:
        for x in range(100):
            file.write(output['posts'][x]['text'])
    except IndexError:
        print('[-] Warning: less than 100 results')
    file.close()
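# Minimal usage sketch, assuming a valid key has been configured above; the
# search term is an arbitrary example, not taken from the source.
get_headlines('election', 'cnn')
get_headlines('election', 'foxnews')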
with open("./webhose_results.json", 'w') as outfile: json.dump(output, outfile, sort_keys=True) insertToDB(output["posts"]) ReqNumber = 1 while (output["moreResultsAvailable"]): output = webhoseio.get_next() # do something for subsequent query results with open("./webhose_results_" + str(ReqNumber) + ".json", 'w') as outfile: json.dump(output, outfile, sort_keys=True) insertToDB(output["posts"]) ReqNumber = ReqNumber + 1 if (ReqNumber >= 5): break def getStockList(filename): df_stock_list = pd.read_csv(filename) return df_stock_list webhoseio.config(token=getToken()) df_companylist = pd.read_csv("/home/kasun/Documents/CompanyList.csv") for name in df_companylist['CompanyName']: name = name.strip() name = name.replace(" ", "%20") Qparams = setQueryParams(name, 'english', '', '') print(Qparams) getContent(Qparams)