Example #1
def get_webhose_news(ticker):
    webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85")
    query_params = {
        "q":
        """language:english thread.country:US published:>1483232461 (site_category:stocks) (WFC OR wells fargo)
                (site_type:blogs OR site_type:discussions) (thread.title:'WFC' OR thread.title:'wells fargo')""",
        "ts": "1533754757303",
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)

    # Build the name variants before they are used in the filters below;
    # company_name is assumed to be defined at module level.
    if len(company_name) >= 2:
        var1 = company_name[0].lower()
        var2 = company_name[1].lower()
        tick = ticker.lower()
        names = [var1, var2, tick]
    else:
        var1 = company_name[0].lower()
        tick = ticker.lower()
        names = [var1, tick]

    # Pair each post's publication date with its individual sentences.
    lst = [[post['published'], sentence]
           for post in output['posts']
           for sentence in post['text'].split('. ')]

    # Keep only the sentences that mention one of the name variants.
    barrons_news = [[date, text] for date, text in lst for var in names
                    if var in text.lower()]
    return barrons_news
def get_pages_into_json(domain, n=1):
    # `domain` is accepted for interface compatibility but not used in the query below.
    num_pages = n

    webhoseio.config(token="a64af0cc-bb64-44dd-a56d-d1d1e06b287e")
    query_params = {
        "q": "language:english",
        "ts": "1512637551646",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)

    newpath = file_path + '/{}'.format('20171214')

    if not os.path.exists(newpath):
        os.makedirs(newpath)

    with open(newpath + '/data_1.json', 'w') as outfile:
        json.dump(output, outfile)

    for p in range(2, num_pages + 1):
        output = webhoseio.get_next()
        with open(newpath + '/data_{}.json'.format(p), 'w') as outfile:
            json.dump(output, outfile)
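
A hypothetical usage sketch for get_pages_into_json above, assuming file_path, os, json and webhoseio are defined at module level as the function body implies; the domain and page count are illustrative:

# Fetch three pages of results and dump them under file_path/20171214.
get_pages_into_json("example.com", n=3)

# Read the dumped pages back and count the posts per page.
dump_dir = os.path.join(file_path, '20171214')
for page_file in sorted(os.listdir(dump_dir)):
    with open(os.path.join(dump_dir, page_file)) as f:
        page = json.load(f)
    print(page_file, len(page.get('posts', [])))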
Example #3
def index(request):
    # Create your views here.
    webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
    time_now = datetime.datetime.now()
    time_30_days_before = time_now - datetime.timedelta(days=30)
    # The "ts" parameter is a millisecond Unix timestamp, as in the other examples.
    ts_30_days_before = int(time_30_days_before.timestamp() * 1000)
    query_params = {
        "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel')",
        "ts": ts_30_days_before,
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)
    context = {'output': output}
    return render(request, 'news/index.html', context)

# def index(request):
#     # Create your views here.
#     webhoseio.config(token="8ebd85ed-94da-4ae1-9bd2-def0554ceb64")
#     query_params = {
#     "q": "(site:bloomberg.com OR site:reuters.com) AND ('mapletree' OR 'capitaland' OR 'Keppel' OR 'AIMS AMP Capital' OR 'Sabana REIT')",
#     "ts": "1516537944411",
#     "sort": "crawled"
#     }
#     output = webhoseio.query("filterWebContent", query_params)
#     return JsonResponse(output)
    def config_webhoseio(self, token):

        # jaju
        # webhoseio.config(token="ecd3d983-093a-4d8d-a7bd-71207dad85a9")
        # divya
        # webhoseio.config(token="e6c1084e-8b63-42cf-bfe3-8ccd24a3a9b1")
        webhoseio.config(token=token)
Example #5
    def retrieve_topmost_article_new(self, stock_name, num_sentences):
        """
        Retrieves the topmost article about the stock, but solely through the usage of the webhose API. This
        does not involve checking the database for an already existing article for the stock.
        :param stock_name:    stock name.
        :param num_sentences: number of sentences to return for summary.
        :return:              a StockArticle object.
        """
        webhoseio.config(token=webhose_api_key)
        filters = {
            'language': 'english',
            'text': stock_name,
            'site_type': 'news',
            'site_category': 'finance',
            'thread.title': stock_name
        }
        query_result = webhoseio.query('filterWebContent', filters)
        stock_posts = query_result['posts']
        if len(stock_posts) == 0:
            return None
        article_url = stock_posts[0].get('url')
        article_text = stock_posts[0].get('text')
        article_summary = self.summarize_article(article_text, num_sentences)

        return StockArticle(stock_name, article_url, article_summary)
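
A hypothetical usage sketch for retrieve_topmost_article_new above; ArticleRetriever stands in for whatever class defines the method, and the stock name is illustrative:

retriever = ArticleRetriever()  # hypothetical owner class of the method above
article = retriever.retrieve_topmost_article_new('Apple', num_sentences=3)
if article is None:
    print('No articles found for this stock.')
else:
    # StockArticle fields are assumed to mirror its constructor arguments.
    print(article.stock_name, article.article_url)
    print(article.article_summary)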
Example #6
    def __init__(self):
        webhoseio.config(token='')
        self.params = {'q': 'site.type:news', 'from': 0}
        results = webhoseio.query('nseFilter', self.params)
        self.total_results = results['totalResults']
        self.page = 0
        self.batches = max(self.total_results // 10, 10)
        self.news_batch = results['docs']
Example #7
def getWebHoseData(location):
    webhoseio.config(token="b99dbdf5-caac-4a2c-886a-fb8f37f365a0")

    query_params = {
        "q": "performance_score:>7 location:" + location,
        "ts": "1506110156153",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)
    totalWebHose = len(output['posts'])
    return totalWebHose
Example #8
def scraper():
    current_time = datetime.datetime.utcnow()
    time_diff = datetime.timedelta(hours=-1 * SCRAPE_TIME_DELTA)
    start_time = current_time + time_diff
    start_time = start_time.timestamp()
    start_time_str = str(round(start_time))
    start_time_str = start_time_str.ljust(13, '0')

    webhoseio.config(token=WEBHOSE_KEY)

    for category in keywords.keys():
        query = query_builder(category, start_time_str)
        scrape(query, category, start_time_str, time_diff)
Example #9
def undependant():
    query = input('What are you searching for? ')
    threads = int(input('How many results do you want? '))
    with open('tango_django/search.key', 'r') as f:
        key = f.readline().strip()
    try:
        webhoseio.config(token=key)
        results = webhoseio.query("filterWebContent", {"q": query})
        # Print the text of the first `threads` posts.
        for i in range(threads):
            print(results['posts'][i]['text'])
        # for post in results['posts'][:10]['text']:
        #     count += 1
        #     print(f'result number {count} \n {post}')
    except (KeyError, IndexError) as err:
        print(f'ooopsie :{err}')
def webhoseio_search(query):

    key = read_webhoseio_key()
    results = []

    webhoseio.config(token=key)
    query_params = {'q': query + ' language:english', 'sort': 'relevancy'}
    output = webhoseio.query('filterWebContent', query_params)

    for result in output['posts']:
        results.append({
            'name': result['title'],
            'url': result['url'],
            'summary': result['published']
        })

    return results[:10]
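
A hypothetical call to webhoseio_search above, assuming read_webhoseio_key() returns a valid token; the query string is illustrative:

for hit in webhoseio_search('electric vehicles'):
    # As built above, 'name' is the post title, 'url' its link, and
    # 'summary' actually holds the published timestamp.
    print(hit['name'], hit['url'], hit['summary'])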
Example #11
def get_webhose_news(ticker):
    webhoseio.config(token="517a4e21-e484-4eac-aa8c-f50916e8db85")
    names = get_names(ticker)
    query_params = {
        "q": f"""language:english thread.country:US published:>1483232461 (site_category:stocks) ({ticker} OR {names[0]} OR {names[-1]})
                (site_type:blogs OR site_type:discussions) (thread.title:'{ticker}' OR thread.title:'{names[0]}' OR thread.title:'{names[-1]}')""",
        "ts": "1533754757303",
        "sort": "published"
    }
    output = webhoseio.query("filterWebContent", query_params)['posts']
    lst = [[y['published'],
            y['text'].replace('\n',' ').lower().split('. ')] for y in output]
    webhose_new = [[datetime.strptime(date.split('T')[0],'%Y-%m-%d').date(),
                    re.sub('// no comments|posted by','',text)] 
                    for date,y in lst for text in y if len(text) < 200
                    for var in names if var in text]
    return webhose_new
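
A hypothetical call to get_webhose_news above, assuming get_names() maps the ticker to its company-name variants and the module-level imports (re, datetime) the body relies on are present; the ticker is illustrative:

for published_date, sentence in get_webhose_news('WFC'):
    print(published_date, sentence)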
    def extract_data_from_webhose():
        webhoseio.config(token="7ad89131-980e-48c3-b588-e68adb7c1be0")
        s = int(time.time()) - 500
        query_params = {"q": "language:english site:amazon.com site_category:shopping spam_score:>0.7", "ts": "{}".format(s), "sort": "crawled"}
        output = webhoseio.query("filterWebContent", query_params)
        logging.info(output)
        key = []
        reviewname = []
        productname = []
        reviewdate = []
        rating = []
        label = []
        sourcetype = []
        runtype = []
        spam_score = []
        text = []
        for i in range(0,1):
            logging.info(i)
            logging.info(output)
            key.append(i)
            reviewname.append(output['posts'][i]['author'])
            productname.append(output['posts'][i]['thread']['title'])
            reviewdate.append(output['posts'][i]['thread']['published'])
            rating.append(output['posts'][i]['thread']['rating'])
            tt = output['posts'][i]['text']
            text.append(tt)
            ss = output['posts'][i]['thread']['spam_score']
            spam_score.append(ss)
        df = pd.DataFrame()
        df['key'] = key
        df['reviewname'] = reviewname
        df['productname'] = productname
        df['reviewdate'] = reviewdate
        df['rating'] = rating
        df['label'] = 'fake'
        df['sourcetype'] = 'amazon'
        df['runtype'] = 'near_real_time'
        df['text'] = text
        df['snapshot_time'] = s

        df.to_gbq('webhoseDB.staging_table', 'wehosestream', if_exists='append', verbose=False)
Example #13
def webhose_func():
    YOUR_API_KEY = "a161f6e5-ab51-40a1-afaf-ba13e67baefa"
    webhoseio.config(token=YOUR_API_KEY)
    print("\n")
    print("WELCOME TO WEBHOSE\n")
    search = input(
        "Input the string that you want to search for! It can be something like ipod OR ipad\nType in a list of strings like cow,chicken,pig to plot sentiment for those words against the stock price.\n3 TERMS ARE ENOUGH!\n"
    )
    search_terms = search.split(",")
    search_df_arr = []
    for search in search_terms:
        search += " language:english"
        sort = input(
            "\nType crawled, relevancy, rating or published for your sorting option\n"
        )
        timestamp = 1541348859918
        size = input(
            "\nWhat is the number of post returned per request? 1 is the smallest and 100 is the biggest!\n"
        )
        query_params = {
            "accuracy_confidence": "high",
            "q": search,
            "sort": sort,
            "ts": timestamp,
            "size": size,
        }
        output = webhoseio.query("filterWebContent", query_params)
        number_of_posts = len(output['posts'])
        dates = []

        for a in range(number_of_posts):
            dates.append(output['posts'][a]['published'])

        df = pd.DataFrame(index=dates, columns=["Title"])
        for i in range(number_of_posts):
            df.iloc[i] = [output['posts'][i]['title']]
        search_df_arr.append(df)

    search_df_arr = search_df_arr + search_terms
    return search_df_arr
Example #14
def main():
    webhoseio.config(token="XXXXXXXXXXXXXXXXX"
                     )  # needs to be substituted by real webhoseio token
    query_params = {
        "q":
        "language:english has_video:false is_first:true site_type:news site:(cnn.com OR bbc.com OR reuters.com OR nbcnews.com OR foxnews.com OR washingtonpost.com OR espn.com OR tmz.com OR sportingnews.com OR usnews.com OR wsj.com OR latimes.com OR time.com OR nydailynews.com OR economist.com OR technewsworld.com OR computerworld.com OR newsmax.com OR theatlantic.com OR hollywoodreporter.com) spam_score:0.0",
        "ts": "1510212713819",
        "sort": "crawled"
    }
    #get 1st set of articles
    output = webhoseio.query("filterWebContent", query_params)

    fl_counter = 1
    while fl_counter <= 1000:
        fl_name = "file" + "_" + str(fl_counter)
        opfile = open('C:/Users/Heena/News3/' + fl_name, 'w',
                      encoding='utf-8')  #specify path to corpus folder here
        for post in output['posts']:
            uuid = post['uuid']
            url = post['url']
            site_full = post['thread']['site_full']
            site_categories = post['thread']['site_categories']
            title_full = post['thread']['title_full']
            title = post['title']
            published = post['published']
            author = post['author']
            text = post['text']

            doc = document(uuid, url, site_full, site_categories, title,
                           title_full, published, author, text)
            jsondata = json.dumps(doc.__dict__, sort_keys=True)
            opfile.write(jsondata + '\n')

        opfile.close()
        time.sleep(30)
        print("fl_counter = ", fl_counter)
        output = webhoseio.get_next()
        print("next = ", output['next'])
        fl_counter += 1
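
A hypothetical read-back of one corpus file written by main() above; each line is an independent JSON object produced by json.dumps(doc.__dict__), and the path mirrors the one hard-coded in the loop:

with open('C:/Users/Heena/News3/file_1', encoding='utf-8') as corpus:
    for line in corpus:
        record = json.loads(line)
        print(record['title'], record['url'])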
Example #15
def api_df(token, site_lists, time_delta, filename):
    """
    A pipeline from Webhose API to CSV.

    :param token:
        api token for Webhose API.
    :param site_lists:
        list of sites we need to crawl.
    :param time_delta:
        time window. Ex: -3 means the most recent 3 days. Can only be from -1 to -30.
    :param filename:
        filename of CSV.
    :return:
        DataFrame of the crawled posts.
    """
    webhoseio.config(token=token)
    query_params = get_query(site_lists, time_delta)
    output_init = webhoseio.query("filterWebContent", query_params)
    output_flat = pd.io.json.json_normalize(output_init['posts'])
    df = output_flat[[
        'thread.uuid', 'author', 'external_links', 'published', 'text',
        'thread.site_full', 'thread.site_categories', 'thread.site_section',
        'thread.section_title', 'thread.main_image',
        'thread.social.facebook.comments', 'thread.social.facebook.likes',
        'thread.social.facebook.shares', 'title', 'url'
    ]]
    output = webhoseio.get_next()
    while len(output['posts']) > 0:
        df = output_to_df(output, df)
        try:
            output = webhoseio.get_next()
        except HTTPError:
            return df
            # df.to_csv(filename, index=False)
        if len(df) % 1000 == 0:
            print(str(len(df)) + ' has finished')
    return df
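
A hypothetical invocation of api_df above; the token, site list and filename are illustrative, and get_query/output_to_df are assumed to be defined elsewhere in the module:

sites = ['reuters.com', 'bloomberg.com']
posts_df = api_df(token='YOUR_WEBHOSE_TOKEN', site_lists=sites,
                  time_delta=-3, filename='posts.csv')
# api_df returns the DataFrame, so write the CSV explicitly here.
posts_df.to_csv('posts.csv', index=False)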
Example #16
def related_news(keywords):
    """
    search for related news by keywords
    use Webhose.io API
    """

    if len(keywords) >= 4:
        keywords = keywords[0:3]

    keyword_str = " ".join(keywords)

    #API key
    webhoseio.config(token="0e3f95f5-2fc7-494f-881e-e29915cc3e9a")
    query_params = {
        "q": keyword_str + " language:english site_type:news",
        "ts": "1528948373304",
        "sort": "relevancy"
    }

    resp = webhoseio.query("filterWebContent", query_params)
    posts = resp['posts']

    if len(posts) < 2:
        return None, None, True

    MAX_ARTICLES = 5  # take first 5

    related_articles = []
    related_urls = []

    for i in range(min(MAX_ARTICLES, len(posts))):
        post = posts[i]['thread']
        related_url = {'url': post['url'], 'title': post['title']}
        related_urls.append(related_url)
        related_articles.append(post['site_full'])  # currently redirected link

    return related_articles, related_urls, False
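
A hypothetical call to related_news above; the keyword list is illustrative:

articles, urls, failed = related_news(['climate', 'policy', 'carbon'])
if not failed:
    for item in urls:
        print(item['title'], item['url'])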
Example #17
import os

from flask import jsonify
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
import numpy as np
import pandas as pd
import webhoseio

WEBHOSE_TOK = os.getenv('WEBHOSEIO_TOKEN')
if WEBHOSE_TOK is None:
    print('ERROR: WEBHOSEIO_TOKEN environment variable not set!')
webhoseio.config(token=WEBHOSE_TOK)


def process_annotations(annotations):
    content = []
    salience = []
    sentiment = []
    magnitude = []

    for entity in annotations.entities:
        content.append(entity.name)
        salience.append(entity.salience)
        sentiment.append(entity.sentiment.score)
        magnitude.append(entity.sentiment.magnitude)

    df = pd.DataFrame({
        "content": content,
        "salience": salience,
Example #18
import webhoseio
import pymongo
import time
from datetime import date
from clientDF import DiffbotClient

action = input("Do you want to make a new request?(Y/N): ")

# List with urls for the diffbot API
urlList = []

if action == 'Y':
    # configuring webhose request
    webhoseio.config(token="4057ff96-3ff1-4982-8c99-41d708f980ef")
    # query = "politics language:english thread.country:GB performance_score:>5"
    query = "Climate Change"
    query_params = {
        "q": "Climate Change",
        "ts": "1518278227788",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)

    # getting the urls of the websites that matched our query/params
    # saving the urls to a file for verification
    outputFilename = input("Enter the name of the file which will contain the webhose urls: ")
    with open(outputFilename, 'w') as urlsOut:
        urlsOut.write("Query used: "+query+"\n\n")
        j = 0
        while output['posts']:
Example #19
from django.shortcuts import render, redirect, HttpResponse
from django.urls import reverse
from django.contrib import messages
from django.http import JsonResponse
import random
import string
# Create your views here.
from .models import Cart, Order
from ..LoginAndRegister.models import User
import webhoseio

webhoseio.config(token='ba97ec58-a132-4dad-aa67-28e170c1c0d6')


def dashboard(request):

    # check if there is a user_id, basically checking whether a user is logged in
    if 'user_id' not in request.session:
        return redirect(reverse("userLG:login"))

    # remember some information about the search history
    if 'productInfo' not in request.session:
        request.session['productInfo'] = {
            'categories': 'shirt',
            'price': '60',
            'brand': 'nike'
        }

    # This information was used so the user could click on a suggested product and see the correct details for that specific product. We delete it here because we do not need it on this page.
    if "product" in request.session:
        del request.session["product"]
import webhoseio
from collections import Counter
webhoseio.config(token="7f568fb7-7400-47c9-9213-7b074c590dc0")
query_params = {
    "q":
    "EternalBlue OR WannaCrypt OR ShadowBroker OR EquationAPT OR Lazarus OR LazarusGroup OR Lazarus-Group OR Erebus OR Ransomware OR GoldenEye OR CryptoLocker OR Locky OR Petya OR zCrypt OR PowerWare OR HydraCrypt OR Cerber OR CryptoWall OR CVE",
    "ts": "1479589849479"
}
output = webhoseio.query("productFilter", query_params)
print(output)
print(output["darkposts"][0]["text"])  # Print the text of the first post
print(output["darkposts"][0]["title"])  # Print the title of the first post

fname = "site_darkweb.txt"
# Get the next batch of posts

output = webhoseio.get_next()

site_freq = Counter()

# Count how often each site appears in this batch of dark web posts.
for post in output['darkposts']:
    site_freq[post['source']['site']] += 1

with open(fname, "w") as f:
    for k, v in site_freq.most_common():
        f.write("{},{}\n".format(k, v))
Example #21
import webhoseio
import json
import time

webhoseio.config(token="e0e8b2b9-80ce-4cdc-bde9-b730ada6c3f9")

f1 = open('cities_code_latlng.csv')

locs = []
i = 0
for l in f1:
    #print(l)
    if i == 0:
        i = 1
        continue
    ls = l.strip().split(',')
    #print(ls)
    #print(ls[0])
    #print(ls[2])
    locs.append((ls[1], ls[2], ls[3], ls[4]))


#print(locs[0])
def get_data(place, country, lat, lng):
    print(place, country)
    query_params = {
        "q":
        "language:english site_type:news thread.country:{} location:{}".format(
            country, place),
        "sort":
        "relevancy",
Example #22
import webhoseio

webhoseio.config(token="50e8f8f7-33f3-4cfe-8b39-d5ae04c44b6b")
query_params = {
	"q": "(\"West Texas Detention Facility\" OR \"Southwest Key Casa Padre\" OR \"West Texas Detention Facility\" OR \"Southwest Key Casa Padre\" OR \"Southwest Key San Benito/Casa Antigua\" OR \"Southwest Key Combes\" OR \"Southwest Key El Cajon\" OR \"Southwest Key Estrella del Norte\" OR \"Southwest Key Youngtown - Hacienda del Sol\" OR \"Southwest Key Glendale\" OR \"Southwest Key Conroe\" OR \"Joe Corley Detention Facility \" OR \"Southwest Key Casa Quetzal\" OR \"Southwest Key Casa Blanca\" OR \"Southwest Key Casa Montezuma\" OR \"Southwest Key Casa El Presidente\" OR \"Southwest Key Casa Grande\" OR \"Southwest Key Casa Franklin\" OR \"Southwest Key Nueva Esperanza\" OR \"Southwest Key Casa Houston Reliant\" OR \"Southwest Key Shelter Care\" OR \"Southwest Key Casita del Valle\" OR \"Southwest Key Pleasant Hill\" OR \"Southwest Key Las Palmas\" OR \"Southwest Key Lemon Grove\" OR \"Southwest Key La Esperanza Home for Boys\" OR \"Southwest Key Campbell\" OR \"Shiloh Treatment Center\" OR \"Shenendoah Valley Juvenile Center\" OR \"La Salle County Regional Detention Center\" OR \"Alice Peterson Residence \" OR \"Dorothy Mitchell Residence \" OR \"Msgr. Bryan Walsh's Children's Village \" OR \"Board of Childcare\" OR \"Brazoria County Youth Homes\" OR \"Galveston Multicultural Institute\" OR \"The Pelican Island Center\" OR \"Bethany Children's Home\" OR \"Yolo County Juvenile Detention Center\" OR \"Children's Home of Kingston\" OR \"Leake and Watts Passages of Hope\" OR \"Griffin Home Residential Programs - Maceachern House\" OR \"His House Children's Home\" OR \"Homestead Temporary Shelter for Unaccompanied Children\" OR \"Holy Family Institute\" OR \"Inwood House/The Children's Village\" OR \"McAllen Station\" OR \"Lincoln Hall\" OR \"Bokenkamp Children's Shelter\" OR \"Morrison Paso Staff Secure\" OR \"Morrison Knott Street\" OR \"Nuevo Amanecer Latino Children's Services\" OR \"Neighbor to Family\" ) site_type:news thread.country:US language:english",
	"ts": "1527911096744",
	"sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    print output['posts'][0]['text'] # Print the text of the first post
    print output['posts'][0]['published'] # Print the text of the first post publication date

    
# Get the next batch of posts

    output = webhoseio.get_next()

    
# Print the site of the first post

    print output['posts'][0]['thread']['site']
Example #23
import requests
import pandas as pd
from bs4 import BeautifulSoup as soup
import webhoseio  # news API
from pinance import Pinance  # Stock API
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXXXXXXXXX")
keywords = [
    '$', 'clinical', 'trial', 'phase', 'investment', 'position', 'price',
    'buy', 'sell'
]
# my_url = 'http://topforeignstocks.com/stock-lists/the-complete-list-of-biotech-stocks-trading-on-nasdaq/' # List of biotech companies (outdated source)
# uClient = requests.get(my_url) #downloads webpage
# page_html = uClient.text
# page_soup = soup(page_html, "html.parser")
# bio_tech_companies = page_soup.findAll("td",{"class":"column-2"}) # specific to name
# bio_tech_companies_symbol = page_soup.findAll("td",{"class":"column-3"}) # specific to symbol
NASDAQ_File = pd.read_csv('NASDAQ_Company_List.csv')
Industry = NASDAQ_File['Industry']
Symbol = NASDAQ_File['Symbol']
Name = NASDAQ_File['Name']
for i in range(len(Industry)):
    if 'Biotechnology' in str(Industry[i]) or 'Pharmaceuticals' in str(
            Industry[i]):
        query = str(Name[i])  # gets name of biotech company
        query_symbol = str(Symbol[i])  # gets symbol of biotech company
        stock = Pinance(query_symbol)
        stock.get_quotes()
        try:  # necessary if stock.get_quotes returns an empty or incomplete dictionary
            if stock.quotes_data['tradeable'] == True:
                stock_price = ((stock.quotes_data['regularMarketPrice']))
                if stock_price < 35.00:
import webhoseio
import logging
import config

logger = logging.getLogger("uniLogger")

webhoseio.config(token=config.WEB_HOSE_TOKEN)
query_params = {
    "q": "title:\"big data\" -text:\"big data\" language:english",
    "sort": "crawled"
}
output = webhoseio.query("filterWebContent", query_params)
print(output['posts'][0]['text'])  # Print the text of the first post
# Print the text of the first post publication date
print(output['posts'][0]['published'])

# Get the next batch of posts

output = webhoseio.get_next()
if len(output['posts']) == 0:
    exit(0)

# Print the site of the first post
print(output['posts'][0]['thread']['site'])


# doesn't really get the kinds of articles I'm looking for.
def scrape_japanese_cuisine():
    query_params = {
        "q": "site_category:japanese_cuisine language:english site_type:blogs",
        "sort": "crawled"
Example #25
import json
import webhoseio

webhoseio.config(token="160a0e9e-b47e-4229-bf40-d7f1e2d5bf35")

query_params = {
    "q": "site:nytimes.com language:english site_type:news spam_score:0",
    "ts": "1504680060834",
    "sort": "crawled"
}

output = webhoseio.query("filterWebContent", query_params)
e = []
count = 0
for i in range(50):
    print(count)
    print(len(output['posts']))
    for x in output['posts']:
        if x['title']:
            count += 1
            e.append({'title': x['title'], 'text': x['text'], 'label': 'real'})

    #time.sleep(300)
    output = webhoseio.get_next()

with open('web75.json', 'w') as outfile:
    json.dump(e, outfile)
print("success")
Example #26
    def config_webhoseio(self):
        webhoseio.config(token="ecd3d983-093a-4d8d-a7bd-71207dad85a9")
Example #27
import webhoseio

webhoseio.config(token="8d786df0-7885-4818-a4d2-0562b9507f1f")
query_params = {
    "q":
    "Urovant Sciences language:english site_type:news thread.published:>1535778000000",
    "sort": "crawled"
}
output = webhoseio.query("filterWebContent", query_params)
print(len(output['posts']))  # number of posts returned in this batch
print(output['posts'][0]['text'])  # Print the text of the first post
print(output['posts'][0]
      ['published'])  # Print the text of the first post publication date
print(output)

# Get the next batch of posts

output = webhoseio.get_next()
print(output)
# Print the site of the first post

print(output['posts'][0])
Example #28
#! /usr/bin/env python3

#from tinydb import TinyDB, Query
import json
import webhoseio

webhoseio.config(token='11a5bf53-12f6-440d-a84f-e42c18c7c38d')
output = webhoseio.query("filterWebContent", {
    "q": "Global Warming",
    "sort": "relevancy"
})
print("URL: " + output['posts'][0]['url'])  # Print the text of the first post
print("title: " +
      output['posts'][0]['title'])  # Print the text of the first post
print("published: " + output['posts'][0]['published']
      )  # Print the text of the first post publication date

output = webhoseio.get_next()
print("URL: " + output['posts'][0]['url'])  # Print the text of the first post
print("title: " +
      output['posts'][0]['title'])  # Print the text of the first post
print("published: " + output['posts'][0]['published']
      )  # Print the text of the first post publication date

# try:
# 	response = urlopen(request)
# 	data = response.read()
# 	parsed_json = json.loads(data)
# except URLError, e:
#    print 'API call not working. Got an error code:', e
# else:
Example #29
    def update(self):
        crawledFrom = self.last_updated.timestamp()
        if abs(self.last_updated - self.last_modified) < timedelta(seconds=1):
            crawledFrom = (timezone.now() - timedelta(days=3)).timestamp()
        crawledFrom = int(crawledFrom*1000)
        
        webhoseio.config(token='e187b1d6-59c5-4b3b-9614-1c42b3e3658e')
        output = webhoseio.query(
            "filterWebContent", 
            {
                "q": self.query,
                "ts": crawledFrom,
                "language": "english",
                "site_type": "news",
            })
        
        output = output['posts']
        while True:
            temp = webhoseio.get_next()
            output += temp['posts']
            if temp['moreResultsAvailable'] <= 0:
                break

        previous_posts_uuid = []
        previous_posts_title = []
        
        if len(output) > 0:
            previous_posts_uuid = [post.uuid for post in Post.objects.all()]
            previous_posts_title = [post.title.lower() for post in Post.objects.all()]

        for post in output:
            if post['thread']['uuid'] in previous_posts_uuid:
                old_post = Post.objects.get(uuid = post['thread']['uuid'])
                if self not in old_post.trackers.all():
                    old_post.trackers.add(self)
            
            elif post['thread']['title'].lower() in previous_posts_title:
                old_post = Post.objects.get(title__iexact = post['thread']['title'])
                if self not in old_post.trackers.all():
                    old_post.trackers.add(self)

            else:
                try:
                    new_post = Post(
                        uuid = post['thread']['uuid'],
                        url = post['thread']['url'],
                        site_full = post['thread']['site_full'],
                        site_categories = post['thread']['site_categories'],
                        title = post['thread']['title'][:1024],
                        published = post['thread']['published'],
                        site_type = post['thread']['site_type'],
                        country = post['thread']['country'],
                        main_image = post['thread']['main_image'],
                        performance_score = post['thread']['performance_score'],
                        domain_rank = post['thread']['domain_rank'],
                        author = post['author'],
                        text = post['text'],
                        language = post['language'],
                        entities = post['entities'],
                        social = post['thread']['social'],
                    )

                    new_post.save()
                    new_post.trackers.add(self)
                    
                    previous_posts_uuid.append(post['thread']['uuid'])
                    previous_posts_title.append(post['thread']['title'].lower())
                
                except DataError as err:
                    print("Error: %s"%(err))
                    print(post)

        self.last_updated = timezone.now()
        self.save()
        
        return True
# Enter Webhose_key before proceeding!


## Takes text data from CNN and Fox News and compares the output of each using
# Natural Language Processing word frequency statistics

import webhoseio
import pandas as pd
import scattertext as st
import spacy.en
from scattertext import CorpusFromPandas, produce_scattertext_explorer, word_similarity_explorer
from IPython.display import IFrame

webhose_key = ""  # Enter Webhose Key Here

webhoseio.config(token=webhose_key)

def get_headlines(search_term, site):
	query_params = {
		"q": search_term + " site:" + site + ".com language:english",
		"sort": "published"
	    }
	output = webhoseio.query("filterWebContent", query_params)
	print('[-] creating ' + site + '_output.txt')
	file = open(site + '_output.txt','w') 
	try:
		for x in range(100):
			file.write(output['posts'][x]['text'])
	except IndexError:
		print('[-] Warning: less than 100 results')
	file.close()
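
Hypothetical calls matching the CNN/Fox comparison described at the top of this snippet; the search term is illustrative, and each call writes a <site>_output.txt file for the scattertext analysis the imports suggest:

get_headlines('immigration', 'cnn')
get_headlines('immigration', 'foxnews')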
Example #31
    with open("./webhose_results.json", 'w') as outfile:
        json.dump(output, outfile, sort_keys=True)

    insertToDB(output["posts"])
    ReqNumber = 1
    while (output["moreResultsAvailable"]):
        output = webhoseio.get_next()
        # do something for subsequent query results
        with open("./webhose_results_" + str(ReqNumber) + ".json",
                  'w') as outfile:
            json.dump(output, outfile, sort_keys=True)
        insertToDB(output["posts"])
        ReqNumber = ReqNumber + 1
        if (ReqNumber >= 5):
            break


def getStockList(filename):
    df_stock_list = pd.read_csv(filename)
    return df_stock_list


webhoseio.config(token=getToken())

df_companylist = pd.read_csv("/home/kasun/Documents/CompanyList.csv")
for name in df_companylist['CompanyName']:
    name = name.strip()
    name = name.replace(" ", "%20")
    Qparams = setQueryParams(name, 'english', '', '')
    print(Qparams)
    getContent(Qparams)