import json
import os

import webhoseio


def get_pages_into_json(domain, n=1):
    # NOTE: the `domain` argument is accepted but not used in the query below.
    num_pages = n

    webhoseio.config(token="a64af0cc-bb64-44dd-a56d-d1d1e06b287e")
    query_params = {
        "q": "language:english",
        "ts": "1512637551646",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)

    # `file_path` is assumed to be a base directory defined elsewhere in the module.
    newpath = file_path + '/{}'.format('20171214')

    if not os.path.exists(newpath):
        os.makedirs(newpath)

    with open(newpath + '/data_1.json', 'w') as outfile:
        json.dump(output, outfile)

    for p in range(2, num_pages + 1):
        output = webhoseio.get_next()
        with open(newpath + '/data_{}.json'.format(p), 'w') as outfile:
            json.dump(output, outfile)
Example 2
def query(start_time, end_time, keywords, entities):
    q_str = " OR ".join(keywords)
    #"published:>" + dt_to_posix(start_time) + " published:<" + dt_to_posix(end_time) + \
    q = " domain_rank:<10000 site_type:news language:english title:(" + q_str + ")" + \
        " site_category:(business OR jobs OR financial_news OR international_news OR internet_technology OR investing OR investors_and_patents OR law_government_and_politics OR legal_issues OR national_news OR finance OR stocks OR tech)"

    params = {
        "q":q,
        "format":"json",
        "ts": str(start_time)
    }

    output = webhoseio.query("filterWebContent", params)

    n = output['totalResults']
    print("TOTAL RESULTS: " + str(n))
    print("REQUESTS REMAINING: " + str(output['requestsLeft']))

    """
    if not os.path.isdir("data/articles/" + dirname):
        os.mkdir("data/articles/" + dirname)

    json.dump(output, open("data/articles/" + dirname + "/0.json", "w"))
    """

    articles = parse_and_update(entities, output)

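    # Webhose returns up to 100 posts per request, so page through the remainder
    # with get_next() (`ceil` assumes a `from math import ceil` elsewhere).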
    for i in range(1, ceil(n/100.0)):
        output = webhoseio.get_next()
        articles += parse_and_update(entities, output)
        #json.dump(output, open("data/articles/" + dirname + "/" + str(i) + ".json", "w"))

    return articles
Example 3
def product_page(request, product_id, product_brand):
    if 'user_id' not in request.session:
        return redirect(reverse("userLG:login"))

    product_list = {}

    if 'product_id' not in request.session:
        print("product_id Initialized <<<<<<<-------")
        request.session['product_id'] = None

    if request.session['product_id'] != str(
            product_id) or 'product' not in request.session:
        request.session['product_id'] = product_id

        print("data from request <<<<<-------")
        query_params = {"q": "product_id: " + str(product_id), 'size': '1'}

        output = webhoseio.query("productFilter", query_params)
        product_list = {
            'product_name': output['products'][0]['name'],
            'product_brand': output['products'][0]['brand'],
            'product_price': output['products'][0]['price'],
            'product_image': output['products'][0]['images'][0],
            'product_description': output['products'][0]['description']
        }
        request.session['product'] = product_list
        # Get the next batch of products
        output = webhoseio.get_next()

        # changing the brand filter in the session
        request.session['productInfo']['brand'] = product_brand
        request.session.modified = True

        suggestion_list = sendingRequest(
            request,
            categories=request.session['productInfo']['categories'],
            brand=product_brand,
            product_id=product_id)
        request.session["suggested_product"] = suggestion_list
    else:
        print("data from session <<<<<-------")
        product_list = request.session['product']
        suggestion_list = request.session["suggested_product"]

    itemsInCart = Cart.objects.count()

    return render(
        request, "ecommerce/productPage.html", {
            'product_list': product_list,
            'suggested_product': suggestion_list,
            'itemsInCart': itemsInCart
        })
Example 4
def sendingRequest(request,
                   brand='nike',
                   categories="sport shirt",
                   price_range=50,
                   product_id=None):
    product_list = []

    if product_id is not None:
        print("There is a product Id <<<<<<----------")
        query_params = {
            "q": "name:(" + categories + ") brand:" + brand + " ",
            'size': '5'
        }
    else:
        print(
            "Products more diverse, because, not requesting by product_id <<<<<<----------"
        )
        query_params = {
            "q": "name:(" + categories + ") price: <" + str(price_range) +
                 " brand:" + brand + " ",
            "size": "25"
        }

    try:
        output = webhoseio.query("productFilter", query_params)
    except IndexError:
        print("Not found <<<<<<<<<<----------")
        # Bail out here so `output` is never used uninitialized below.
        return product_list
    for item in output.get('products', []):
        if len(item['images']) < 1:
            continue
        product_list.append({
            'product_price': item['price'],
            'product_image': item['images'][0],
            'product_id': item['product_id'],
            'product_brand': item['brand']
        })

    # Get the next batch of products
    output = webhoseio.get_next()

    if len(product_list) < 1:
        return HttpResponse(
            "<h4 class='text-center text-white bg-dark p-3 mt-5 shadow'>Items Not Found!!</h4>"
        )
    return product_list
Example 5
def api_df(token, site_lists, time_delta, filename):
    """
    A pipeline from Webhose API to CSV.

    :param token:
        api token for Webhose API.
    :param site_lists:
        list of sites we need to crawl.
    :param time_delta:
        time window. Ex: -3 means the most recent 3 days. Can only be from -1 to -30.
    :param filename:
        filename of CSV.
    :return:
        DataFrame of the collected posts.
    """
    webhoseio.config(token=token)
    query_params = get_query(site_lists, time_delta)
    output_init = webhoseio.query("filterWebContent", query_params)
    # pd.json_normalize replaces the deprecated pd.io.json.json_normalize (pandas >= 1.0).
    output_flat = pd.json_normalize(output_init['posts'])
    df = output_flat[[
        'thread.uuid', 'author', 'external_links', 'published', 'text',
        'thread.site_full', 'thread.site_categories', 'thread.site_section',
        'thread.section_title', 'thread.main_image',
        'thread.social.facebook.comments', 'thread.social.facebook.likes',
        'thread.social.facebook.shares', 'title', 'url'
    ]]
    output = webhoseio.get_next()
    while len(output['posts']) > 0:
        df = output_to_df(output, df)
        try:
            output = webhoseio.get_next()
        except HTTPError:  # imported from requests.exceptions or urllib.error, depending on setup
            return df
            # df.to_csv(filename, index=False)
        if len(df) % 1000 == 0:
            print(str(len(df)) + ' has finished')
    return df
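
A minimal, hypothetical driver for api_df could look like the sketch below. The token, the site list, and the final to_csv call are illustrative assumptions, and the get_query and output_to_df helpers referenced by api_df are assumed to be defined elsewhere; api_df itself only returns the DataFrame (its own to_csv call is commented out).

# Hypothetical usage of api_df; the token and site list are placeholders.
news_df = api_df(
    token="YOUR_WEBHOSE_TOKEN",
    site_lists=["reuters.com", "bbc.com"],
    time_delta=-3,               # the most recent three days
    filename="news.csv",
)
news_df.to_csv("news.csv", index=False)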
Example 6
def getContent(query_params):
    output = webhoseio.query("filterWebContent", query_params)
    print(output)
    with open("./webhose_results.json", 'w') as outfile:
        json.dump(output, outfile, sort_keys=True)

    insertToDB(output["posts"])
    ReqNumber = 1
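    # Persist and insert each additional page, stopping after five requests in total.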
    while (output["moreResultsAvailable"]):
        output = webhoseio.get_next()
        # do something for subsequent query results
        with open("./webhose_results_" + str(ReqNumber) + ".json",
                  'w') as outfile:
            json.dump(output, outfile, sort_keys=True)
        insertToDB(output["posts"])
        ReqNumber = ReqNumber + 1
        if (ReqNumber >= 5):
            break
Example 7
def main():
    global output
    qn = input('What do you want to ask?')

    tokens = word_tokenize(qn)
    Tokens = []
    for token in tokens:
        if token.lower() not in sw:
            Tokens.append(token)
    qnF = ' '.join(Tokens)

    typeSort()

    query_params = {
        "q": qnF + " language:english site_type:"+ sorttype,
        "ts": "1526543100240",
        "sort": "crawled"
    }

    output = webhoseio.query("filterWebContent", query_params)

    firstPost = []
    if sorttype == "blogs":
        for h in output['posts']:
            # Skip blog posts whose text contains the curse word.
            if curse in h.get('text', ''):
                continue
            firstPost.append(h)
            printArticle()
            break
    else:
        printArticle()
        output = webhoseio.get_next()

    again = input("Do you want to hear about something else?")
    for x in again.split():
        if x.lower() in agree:
            main()
        else:
            print("Good day! See you!")
            break
Example 8
def main():
    # Substitute a real webhoseio token here.
    webhoseio.config(token="XXXXXXXXXXXXXXXXX")
    query_params = {
        "q":
        "language:english has_video:false is_first:true site_type:news site:(cnn.com OR bbc.com OR reuters.com OR nbcnews.com OR foxnews.com OR washingtonpost.com OR espn.com OR tmz.com OR sportingnews.com OR usnews.com OR wsj.com OR latimes.com OR time.com OR nydailynews.com OR economist.com OR technewsworld.com OR computerworld.com OR newsmax.com OR theatlantic.com OR hollywoodreporter.com) spam_score:0.0",
        "ts": "1510212713819",
        "sort": "crawled"
    }
    #get 1st set of articles
    output = webhoseio.query("filterWebContent", query_params)

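    # Write each page of posts to its own numbered file, sleeping 30 s between requests.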
    fl_counter = 1
    while fl_counter <= 1000:
        fl_name = "file" + "_" + str(fl_counter)
        opfile = open('C:/Users/Heena/News3/' + fl_name, 'w',
                      encoding='utf-8')  #specify path to corpus folder here
        for post in output['posts']:
            uuid = post['uuid']
            url = post['url']
            site_full = post['thread']['site_full']
            site_categories = post['thread']['site_categories']
            title_full = post['thread']['title_full']
            title = post['title']
            published = post['published']
            author = post['author']
            text = post['text']

            doc = document(uuid, url, site_full, site_categories, title,
                           title_full, published, author, text)
            jsondata = json.dumps(doc.__dict__, sort_keys=True)
            opfile.write(jsondata + '\n')

        opfile.close()
        time.sleep(30)
        print("fl_counter = ", fl_counter)
        output = webhoseio.get_next()
        print("next = ", output['next'])
        fl_counter += 1
Example 9
def scrape(query, category, start_time_str, time_diff):
    print('Start scraping data from ' + start_time_str)

    query_params = {"q": query, "sort": "crawled"}

    news_list = []

    # Fetch the first page, then keep paging until every result has been collected.
    output = webhoseio.query("filterWebContent", query_params)

    while True:
        news_list = news_list + output['posts']
        if len(news_list) >= output['totalResults'] or len(output['posts']) == 0:
            break
        output = webhoseio.get_next()

    filename = (DATA_PATH + 'News_{0}_'.format(category) +
                str(datetime.datetime.utcnow() + time_diff).replace(
                    ' ', '_').replace(':', '_') + '.json')

    with open(filename, 'w') as outfile:
        json.dump(news_list, outfile)

    print('Persisted News Article at the following location: ' + filename)
    print('{0} news articles were collected.'.format(len(news_list)))
Example 10
    output = webhoseio.query("filterWebContent", query_params)

    # getting the urls of the websites that matched our query/params
    # saving the urls to a file for verification
    outputFilename = input("Enter the name of the file which will contain the webhose urls: ")
    with open(outputFilename, 'w') as urlsOut:
        urlsOut.write("Query used: "+query+"\n\n")
        j = 0
        while output['posts']:
            for post in output['posts']:
                urlsOut.write(str(j) + ".\n" + post['url'] + "\n")
                urlList.append(post['url'])
                j += 1
            output = webhoseio.get_next()

    # Get the next batch of posts
    output = webhoseio.get_next()
elif action == 'N':
    # Reading the urls from a given file
    # !! the file must have a specific format
    fileName = input("Enter the filename which contains the urls: ")
    with open(fileName) as f:
        next(f)
        next(f)
        i = 0
        for line in f:
            if i % 2 != 0:
                urlList.append(line[:-1])
            i += 1
Example 11
    def update(self):
        crawledFrom = self.last_updated.timestamp()
        if abs(self.last_updated - self.last_modified) < timedelta(seconds=1):
            crawledFrom = (timezone.now() - timedelta(days=3)).timestamp()
        crawledFrom = int(crawledFrom*1000)
        
        webhoseio.config(token='e187b1d6-59c5-4b3b-9614-1c42b3e3658e')
        output = webhoseio.query(
            "filterWebContent", 
            {
                "q": self.query,
                "ts": crawledFrom,
                "language": "english",
                "site_type": "news",
            })
        
        output = output['posts']
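        # Keep paging with get_next() until the API reports no more results available.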
        while True:
            temp = webhoseio.get_next()
            output += temp['posts']
            if temp['moreResultsAvailable'] <= 0:
                break

        previous_posts_uuid = []
        previous_posts_title = []
        
        if len(output) > 0:
            # Evaluate the queryset once instead of hitting the database twice.
            existing_posts = Post.objects.all()
            previous_posts_uuid = [post.uuid for post in existing_posts]
            previous_posts_title = [post.title.lower() for post in existing_posts]

        for post in output:
            if post['thread']['uuid'] in previous_posts_uuid:
                old_post = Post.objects.get(uuid = post['thread']['uuid'])
                if self not in old_post.trackers.all():
                    old_post.trackers.add(self)
            
            elif post['thread']['title'].lower() in previous_posts_title:
                old_post = Post.objects.get(title__iexact = post['thread']['title'])
                if self not in old_post.trackers.all():
                    old_post.trackers.add(self)

            else:
                try:
                    new_post = Post(
                        uuid = post['thread']['uuid'],
                        url = post['thread']['url'],
                        site_full = post['thread']['site_full'],
                        site_categories = post['thread']['site_categories'],
                        title = post['thread']['title'][:1024],
                        published = post['thread']['published'],
                        site_type = post['thread']['site_type'],
                        country = post['thread']['country'],
                        main_image = post['thread']['main_image'],
                        performance_score = post['thread']['performance_score'],
                        domain_rank = post['thread']['domain_rank'],
                        author = post['author'],
                        text = post['text'],
                        language = post['language'],
                        entities = post['entities'],
                        social = post['thread']['social'],
                    )

                    new_post.save()
                    new_post.trackers.add(self)
                    
                    previous_posts_uuid.append(post['thread']['uuid'])
                    previous_posts_title.append(post['thread']['title'].lower())
                
                except DataError as err:
                    print("Error: %s"%(err))
                    print(post)

        self.last_updated = timezone.now()
        self.save()
        
        return True
Example 12
    def getNextBatch(self):
        if self.output is None or webhoseio is None:
            raise Exception("should request first")
        self.output = webhoseio.get_next()
        return self.output
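
getNextBatch above is only a fragment of a larger wrapper class. A minimal sketch of the kind of class it could belong to is shown below; the class name, the request method, the endpoint, and the token handling are assumptions for illustration, not part of the original.

import webhoseio


class WebhoseClient:
    """Hypothetical wrapper showing how getNextBatch pairs with an initial query."""

    def __init__(self, token):
        webhoseio.config(token=token)
        self.output = None

    def request(self, query_params):
        # The first page must be fetched before getNextBatch() can page forward.
        self.output = webhoseio.query("filterWebContent", query_params)
        return self.output

    def getNextBatch(self):
        if self.output is None:
            raise Exception("should request first")
        self.output = webhoseio.get_next()
        return self.output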