def get_pages_into_json(domain, n=1):
    """Download `n` pages of Webhose "filterWebContent" results and dump each page to JSON.

    Files are written as data_1.json .. data_n.json under a dated subfolder of
    the module-level `file_path` directory.

    :param domain: currently unused; kept for caller compatibility.
    :param n: number of result pages to download (default 1).
    :return: None
    """
    num_pages = n
    # NOTE(review): API token hard-coded in source — move to config/env var.
    webhoseio.config(token="a64af0cc-bb64-44dd-a56d-d1d1e06b287e")
    query_params = {
        "q": "language:english",
        "ts": "1512637551646",
        "sort": "crawled"
    }
    # First page resets the client's pagination cursor.
    output = webhoseio.query("filterWebContent", query_params)
    # `file_path` is a module-level name defined elsewhere in this file.
    newpath = os.path.join(file_path, '20171214')
    # exist_ok avoids the racy exists()-then-makedirs() pattern.
    os.makedirs(newpath, exist_ok=True)
    with open(os.path.join(newpath, 'data_1.json'), 'w') as outfile:
        json.dump(output, outfile)
    # Remaining pages come from the cursor kept inside the webhoseio client.
    for p in range(2, num_pages + 1):
        output = webhoseio.get_next()
        with open(os.path.join(newpath, 'data_{}.json'.format(p)), 'w') as outfile:
            json.dump(output, outfile)
def query(start_time, end_time, keywords, entities):
    """Fetch English news articles whose titles match any of `keywords`.

    Issues a Webhose "filterWebContent" query restricted to high-ranked news
    sites in a fixed set of categories, then walks every result page and feeds
    each page through parse_and_update().

    :param start_time: crawl-window start, used as the "ts" query parameter.
    :param end_time: currently unused.
    :param keywords: iterable of title keywords, OR-ed together.
    :param entities: passed through to parse_and_update().
    :return: accumulated list of parsed articles.
    """
    title_clause = " OR ".join(keywords)
    q = (
        " domain_rank:<10000 site_type:news language:english title:("
        + title_clause
        + ")"
        + " site_category:(business OR jobs OR financial_news OR international_news OR internet_technology OR investing OR investors_and_patents OR law_government_and_politics OR legal_issues OR national_news OR finance OR stocks OR tech)"
    )
    params = {"q": q, "format": "json", "ts": str(start_time)}
    output = webhoseio.query("filterWebContent", params)
    total = output['totalResults']
    print("TOTAL RESULTS: " + str(total))
    print("REQUESTS REMAINING: " + str(output['requestsLeft']))
    articles = parse_and_update(entities, output)
    # Each Webhose page holds up to 100 posts; pull the remaining pages.
    for _ in range(1, ceil(total / 100.0)):
        output = webhoseio.get_next()
        articles += parse_and_update(entities, output)
    return articles
def product_page(request, product_id, product_brand):
    """Render the product detail page, caching product data and suggestions in the session.

    Redirects anonymous visitors to the login view. On a cache miss (first
    visit, or a different product than the one stored in the session) the view
    re-queries the Webhose product API and refreshes the cached product dict
    and suggestion list; otherwise both are served from the session.
    """
    # Only logged-in users (session carries user_id) may view product pages.
    if 'user_id' not in request.session:
        return redirect(reverse("userLG:login"))
    product_list = {}
    if 'product_id' not in request.session:
        print("product_id Initialized <<<<<<<-------")
        request.session['product_id'] = None
    # Cache miss: different product than last time, or no product cached yet.
    if request.session['product_id'] != str(
            product_id) or 'product' not in request.session:
        request.session['product_id'] = product_id
        print("data from request <<<<<-------")
        # NOTE(review): product_id is concatenated here (so assumed str), yet
        # compared against str(product_id) above — confirm the URLconf type.
        query_params = {"q": "product_id: " + product_id + "", 'size': '1'}
        output = webhoseio.query("productFilter", query_params)
        # Keep only the fields the template needs from the single hit.
        product_list = {
            'product_name': output['products'][0]['name'],
            'product_brand': output['products'][0]['brand'],
            'product_price': output['products'][0]['price'],
            'product_image': output['products'][0]['images'][0],
            'product_description': output['products'][0]['description']
        }
        request.session['product'] = product_list
        # Get the next batch of products
        # NOTE(review): the result of get_next() is never used — presumably
        # only advances the API cursor; confirm this call is needed at all.
        output = webhoseio.get_next()
        # changing the brand filter in the session
        request.session['productInfo']['brand'] = product_brand
        # Mutating a nested session dict does not mark the session dirty —
        # flag it explicitly so Django persists the change.
        request.session.modified = True
        suggestion_list = sendingRequest(
            request,
            catergories=request.session['productInfo']['categories'],
            brand=product_brand,
            product_id=product_id)
        request.session["suggested_product"] = suggestion_list
    else:
        # Cache hit: serve both product and suggestions from the session.
        print("data from session <<<<<-------")
        product_list = request.session['product']
        suggestion_list = request.session["suggested_product"]
    itemsInCart = Cart.objects.all().count()
    return render(
        request, "ecommerce/productPage.html", {
            'product_list': product_list,
            'suggested_product': suggestion_list,
            'itemsInCart': itemsInCart
        })
def sendingRequest(request, brand='nike', catergories="sport shirt", price_range=50, product_id=None):
    """Query the Webhose product API and return a list of suggestion dicts.

    With a product_id the query is narrow (5 same-brand items); without one it
    is broader (25 items under `price_range`). Products lacking an image are
    skipped.

    :return: list of {'product_price', 'product_image', 'product_id',
        'product_brand'} dicts, or an HttpResponse "not found" banner when no
        product qualifies (existing caller-visible behavior, kept as-is).
    """
    product_list = []
    if product_id is not None:
        print("There is a product Id <<<<<<----------")
        query_params = {
            "q": "name:(" + catergories + ") brand:" + brand + " ",
            'size': '5'
        }
    else:
        print(
            "Products more diverse, because, not requesting by product_id <<<<<<----------"
        )
        query_params = {
            "q": "name:(" + catergories + ") price: <" + str(price_range) +
                 " brand:" + brand + " ",
            "size": "25"
        }
    try:
        output = webhoseio.query("productFilter", query_params)
    except IndexError:
        print("Not found <<<<<<<<<<----------")
        # Bug fix: the original swallowed the exception and fell through with
        # `output` undefined, so `output.items()` below raised NameError.
        # Return the same "not found" response the empty-result path uses.
        return HttpResponse(
            "<h4 class='text-center text-white bg-dark p-3 mt-5 shadow'>Items Not Found!!</h4>"
        )
    for key, value in output.items():
        if key == 'products':
            for product in value:
                # Skip products without at least one image.
                if len(product['images']) < 1:
                    continue
                product_list.append({
                    'product_price': product['price'],
                    'product_image': product['images'][0],
                    'product_id': product['product_id'],
                    'product_brand': product['brand']
                })
    # Get the next batch of products
    # NOTE(review): result unused — presumably only advances the API cursor.
    output = webhoseio.get_next()
    if len(product_list) < 1:
        return HttpResponse(
            "<h4 class='text-center text-white bg-dark p-3 mt-5 shadow'>Items Not Found!!</h4>"
        )
    return product_list
def api_df(token, site_lists, time_delta, filename):
    """
    A pipeline from Webhose API to a DataFrame.

    :param token: api token for Webhose API.
    :param site_lists: list of sites we need to crawl.
    :param time_delta: time window. Ex: -3 means the most recent 3 days. Can only be from -1 to -30.
    :param filename: target CSV filename (currently unused; the DataFrame is returned instead).
    :return: pandas.DataFrame of collected posts.
    """
    webhoseio.config(token=token)
    query_params = get_query(site_lists, time_delta)
    output_init = webhoseio.query("filterWebContent", query_params)
    # Fix: pd.io.json.json_normalize is deprecated (removed in modern pandas);
    # pd.json_normalize is the supported spelling since pandas 0.25.
    output_flat = pd.json_normalize(output_init['posts'])
    df = output_flat[[
        'thread.uuid', 'author', 'external_links', 'published', 'text',
        'thread.site_full', 'thread.site_categories', 'thread.site_section',
        'thread.section_title', 'thread.main_image',
        'thread.social.facebook.comments', 'thread.social.facebook.likes',
        'thread.social.facebook.shares', 'title', 'url'
    ]]
    output = webhoseio.get_next()
    # Keep paging until a page comes back empty.
    while len(output['posts']) > 0:
        df = output_to_df(output, df)
        try:
            output = webhoseio.get_next()
        except HTTPError:
            # Quota exhausted / transient API failure: return what we have.
            return df
        # Progress log every 1000 accumulated rows.
        if len(df) % 1000 == 0:
            print(str(len(df)) + ' has finished')
    return df
def getContent(query_params):
    """Run a Webhose "filterWebContent" query, persist each result page to a
    JSON file, and insert its posts into the database.

    The first page goes to ./webhose_results.json; subsequent pages go to
    ./webhose_results_<n>.json. At most 4 follow-up pages are fetched.
    """
    output = webhoseio.query("filterWebContent", query_params)
    print(output)
    with open("./webhose_results.json", 'w') as outfile:
        json.dump(output, outfile, sort_keys=True)
    insertToDB(output["posts"])
    page = 1
    while output["moreResultsAvailable"]:
        # Fetch and persist the next page of results.
        output = webhoseio.get_next()
        with open(f"./webhose_results_{page}.json", 'w') as outfile:
            json.dump(output, outfile, sort_keys=True)
        insertToDB(output["posts"])
        page += 1
        # Hard cap on follow-up requests.
        if page >= 5:
            break
def main():
    """Interactive Q&A loop: tokenize the user's question, strip stopwords,
    query Webhose, and print an article; recurses while the user agrees.

    Relies on module-level state defined elsewhere: `output`, `sw`, `sorttype`,
    `curse`, `agree`, `typeSort()`, `printArticle()`.
    """
    global output
    qn = input('What do you want to ask?')
    tokens = word_tokenize(qn)
    # Drop stopwords before building the search string.
    Tokens = []
    for token in tokens:
        if token.lower() not in sw:
            Tokens.append(token)
    qnF = ' '.join(Tokens)
    typeSort()
    query_params = {
        "q": qnF + " language:english site_type:" + sorttype,
        "ts": "1526543100240",
        "sort": "crawled"
    }
    output = webhoseio.query("filterWebContent", query_params)
    firstPost = []
    # Bug fix: original used `sorttype is "blogs"` — identity comparison with
    # a string literal is implementation-dependent; use equality.
    if sorttype == "blogs":
        for h in output['posts']:
            # NOTE(review): `curse in output['posts']` tests membership against
            # the list of post dicts, so it is almost certainly always False —
            # probably meant `curse in h['text']`; confirm intent before changing.
            if curse in output['posts']:
                continue
            else:
                firstPost.append(h)
                printArticle()
                break
    else:
        printArticle()
    output = webhoseio.get_next()
    again = input("Do you want to hear about something else?")
    # Only the first word of the reply is effectively consulted (break/recurse).
    for x in again.split():
        if x.lower() in agree:
            main()
        else:
            print("Good day! See you!")
            break
def main():
    """Crawl up to 1000 pages of Webhose news posts from a fixed site list,
    writing one JSON-lines corpus file per page."""
    # needs to be substituted by real webhoseio token
    webhoseio.config(token="XXXXXXXXXXXXXXXXX")
    query_params = {
        "q":
        "language:english has_video:false is_first:true site_type:news site:(cnn.com OR bbc.com OR reuters.com OR nbcnews.com OR foxnews.com OR washingtonpost.com OR espn.com OR tmz.com OR sportingnews.com OR usnews.com OR wsj.com OR latimes.com OR time.com OR nydailynews.com OR economist.com OR technewsworld.com OR computerworld.com OR newsmax.com OR theatlantic.com OR hollywoodreporter.com) spam_score:0.0",
        "ts": "1510212713819",
        "sort": "crawled"
    }
    # get 1st set of articles
    output = webhoseio.query("filterWebContent", query_params)
    for batch_no in range(1, 1001):
        # specify path to corpus folder here
        handle = open('C:/Users/Heena/News3/' + f"file_{batch_no}", 'w',
                      encoding='utf-8')
        for post in output['posts']:
            thread = post['thread']
            # One serialized document per line (JSON-lines format).
            doc = document(post['uuid'], post['url'], thread['site_full'],
                           thread['site_categories'], post['title'],
                           thread['title_full'], post['published'],
                           post['author'], post['text'])
            handle.write(json.dumps(doc.__dict__, sort_keys=True) + '\n')
        handle.close()
        # Throttle between API requests.
        time.sleep(30)
        print("fl_counter = ", batch_no)
        output = webhoseio.get_next()
        print("next = ", output['next'])
def scrape(query, category, start_time_str, time_diff):
    """Collect all Webhose news posts matching `query` and persist them as one JSON file.

    :param query: Webhose query string.
    :param category: label embedded in the output filename.
    :param start_time_str: human-readable start time, used for logging only.
    :param time_diff: timedelta added to UTC now when building the filename.
    :return: None (writes DATA_PATH/News_<category>_<timestamp>.json).
    """
    print('Start scraping data from ' + start_time_str)
    query_params = {"q": query, "sort": "crawled"}
    # Bug fix: the original called webhoseio.query() on every loop iteration,
    # which restarts pagination, so the first page was fetched repeatedly and
    # duplicated into news_list (the get_next() result was only consulted for
    # totalResults). Query once, then page with get_next() until done.
    output = webhoseio.query("filterWebContent", query_params)
    news_list = list(output['posts'])
    while output['posts'] and len(news_list) < output['totalResults']:
        output = webhoseio.get_next()
        news_list = news_list + output['posts']
    filename = (DATA_PATH + 'News_{0}_'.format(category) +
                str(datetime.datetime.utcnow() + time_diff).replace(
                    ' ', '_').replace(':', '_') + '.json')
    with open(filename, 'w') as outfile:
        json.dump(news_list, outfile)
    print('Persisted News Article at the following location: ' + filename)
    print('{0} news articles were collected.'.format(len(news_list)))
output = webhoseio.query("filterWebContent", query_params) # getting the urls of the websites that matched our query/params # saving the urls to a file for verification outputFilename = input("Enter the name of the file which will contain the webhose urls: ") with open(outputFilename, 'w') as urlsOut: urlsOut.write("Query used: "+query+"\n\n") j = 0 while output['posts']: i = 0 for var in output['posts']: urlsOut.write(str(j)+".\n"+output['posts'][i]['url']+"\n") urlList.append(output['posts'][i]['url']) i += 1 j += 1 output = webhoseio.get_next() # Get the next batch of posts output = webhoseio.get_next() elif action == 'N': # Reading the urls from a given file # !! the file must have a specific format fileName = input("Enter the filename which contains the urls: ") with open(fileName) as f: next(f) next(f) i = 0 for line in f: if i % 2 != 0: urlList.append(line[:-1]) i += 1
def update(self):
    """Fetch recent Webhose posts matching this tracker's query and link them to it.

    Creates new Post rows for unseen results and attaches this tracker to
    posts already stored (matched by uuid, then by case-insensitive title),
    then stamps last_updated. Returns True on completion.
    """
    # Crawl window normally starts at the last update time...
    crawledFrom = self.last_updated.timestamp()
    # ...but if the tracker was never really updated since its last
    # modification, fall back to the most recent 3 days.
    if abs(self.last_updated - self.last_modified) < timedelta(seconds=1):
        crawledFrom = (timezone.now() - timedelta(days=3)).timestamp()
    # Webhose "ts" is expressed in milliseconds.
    crawledFrom = int(crawledFrom*1000)
    # NOTE(review): API token hard-coded in source — should live in settings/env.
    webhoseio.config(token='e187b1d6-59c5-4b3b-9614-1c42b3e3658e')
    output = webhoseio.query(
        "filterWebContent", {
            "q": self.query,
            "ts": crawledFrom,
            # NOTE(review): Webhose usually expects language/site_type filters
            # inside the "q" string, not as top-level params — confirm these
            # actually take effect.
            "language": "english",
            "site_type": "news",
        })
    output = output['posts']
    # Drain remaining pages. NOTE(review): get_next() is called at least once
    # even if the first page already reported no more results — confirm intended.
    while True:
        temp = webhoseio.get_next()
        output += temp['posts']
        if temp['moreResultsAvailable'] <= 0:
            break
    previous_posts_uuid = []
    previous_posts_title = []
    if len(output) > 0:
        # Snapshot existing posts so duplicates get attached, not re-inserted.
        previous_posts_uuid = [post.uuid for post in Post.objects.all()]
        previous_posts_title = [post.title.lower() for post in Post.objects.all()]
    for post in output:
        if post['thread']['uuid'] in previous_posts_uuid:
            # Already stored: just ensure this tracker is attached.
            old_post = Post.objects.get(uuid = post['thread']['uuid'])
            if self not in old_post.trackers.all():
                old_post.trackers.add(self)
        elif post['thread']['title'].lower() in previous_posts_title:
            # Same article crawled under a different uuid/URL: match by title.
            old_post = Post.objects.get(title__iexact = post['thread']['title'])
            if self not in old_post.trackers.all():
                old_post.trackers.add(self)
        else:
            try:
                # Title truncated to fit the model's 1024-char column.
                new_post = Post(
                    uuid = post['thread']['uuid'],
                    url = post['thread']['url'],
                    site_full = post['thread']['site_full'],
                    site_categories = post['thread']['site_categories'],
                    title = post['thread']['title'][:1024],
                    published = post['thread']['published'],
                    site_type = post['thread']['site_type'],
                    country = post['thread']['country'],
                    main_image = post['thread']['main_image'],
                    performance_score = post['thread']['performance_score'],
                    domain_rank = post['thread']['domain_rank'],
                    author = post['author'],
                    text = post['text'],
                    language = post['language'],
                    entities = post['entities'],
                    social = post['thread']['social'],
                )
                new_post.save()
                new_post.trackers.add(self)
                # Record within this run so a repeated result is not re-inserted.
                previous_posts_uuid.append(post['thread']['uuid'])
                previous_posts_title.append(post['thread']['title'].lower())
            except DataError as err:
                # Log and skip rows the database rejects (e.g. over-long fields).
                print("Error: %s"%(err))
                print(post)
    self.last_updated = timezone.now()
    self.save()
    return True
def getNextBatch(self):
    """Advance to the next page of Webhose results, cache it on self, and return it.

    Raises Exception when no initial request has been made yet.
    """
    # Short-circuit order matters: self.output is checked before webhoseio.
    not_ready = self.output is None or webhoseio is None
    if not_ready:
        raise Exception("should request first")
    batch = webhoseio.get_next()
    self.output = batch
    return batch