Example #1
def get_news_from_url():
    """ This function finds all news items from mongodb which do not have a
        full text. It then downloads full text for all such items"""

    # Parser setup
    g = Goose({'browser_user_agent': 'Mozilla'})

    total_items = db.news_data.find({
        "full_article_text": {
            "$exists": False
        }
    }).count()
    uf._printer_G('Total items to retrieve: %d' % (total_items))
    i = 1
    for d in db.news_data.find({"full_article_text": {"$exists": False}}):
        # print d
        # code.interact( local=locals() )

        uf._printer_G('---%d of %d---' % (i, total_items))
        i += 1
        startTime = time.time()
        # TODO: only print _id and uuid. As we add more alert sources besides
        #       google-alerts, we may or may not have titles from them; possibly
        #       we will only have the urls.
        print '_id        :', str(d['_id'])
        print 'uuid       :', d['uuid']
        print 'Downloading: ', d['url']
        print 'Alert on   :', d['alert_title']  # consider not printing this
        print 'news_id    :', d['news_id']  # consider not printing this

        new_data = {}
        new_data['full_article_title'] = ""
        new_data['full_article_text'] = ""
        new_data['full_article_domain'] = ""
        new_data['full_article_publish_date'] = ""
        try:
            with uf.Timeout(5):
                article = g.extract(url=d['url'])
                print 'article.title       :', article.title
                print 'article.domain      :', article.domain
                # print article.cleaned_text
                print 'article.publish_date:', article.publish_date

                new_data['full_article_title'] = article.title
                new_data['full_article_text'] = article.cleaned_text
                new_data['full_article_domain'] = article.domain
                new_data['full_article_publish_date'] = article.publish_date
                # code.interact( local=locals() )
        except uf.Timeout.Timeout:
            print '[ERROR] Timeout. This item (uuid=%s) will be empty' % (
                d['uuid'])
        except:
            print 'py-goose retrieval failed!'

        db.news_data.find_one_and_update({"_id": d['_id']}, {"$set": new_data})
        uf._printer_('Done in %4.2fs' % (time.time() - startTime))
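The function above assumes a module-level MongoDB handle `db` and the repository's own helper module `uf` (colored printers, the Timeout context manager), neither of which is shown in this excerpt. A minimal, hypothetical sketch of the wiring it relies on, assuming pymongo and the database name suggested by the comments in Example #4 (db.sun_dance):

import time
from goose import Goose            # goose-extractor; the function builds its own parser from this
from pymongo import MongoClient
# 'uf' is the repository's own utility module and is not reproduced here.

db = MongoClient('localhost', 27017)['sun_dance']   # db.sun_dance.news_data, per the comments in Example #4
get_news_from_url()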
input_html_file = 'google-alerts.html'
output_csv = input_html_file + '.csv'  #'feed_list_ejcbsdhansbn.csv'
uf._printer_Y('Input File  : ' + input_html_file)
uf._printer_Y('Output File : ' + output_csv)

soup = BeautifulSoup(open(input_html_file).read(), "lxml")
alerts_soup = soup.find("div", {"id": "manage-alerts-div"})
if alerts_soup is None:
    uf._error("No div with id:manage-alerts-div.\nQuit---")
    quit()

all_li = alerts_soup.findAll("li")
if len(all_li) < 1:
    uf._error("No Alerts\nQuit-----")
    quit()

fp_out = open(output_csv, 'w')

for li in all_li:
    tag_text = li.find("div", {"class": "query_div"}).get_text().strip()
    rss_url = li.find("a")['href']

    uf._printer_('---')
    uf._printer_(' tag     : ' + tag_text)
    uf._printer_G('rss_url : ' + rss_url)

    fp_out.write("%s,%s\n" % (tag_text, rss_url))
fp_out.close()

uf._printer_G("Done!")
Example #3
    print 'Downloading: ', d['url']
    print 'Alert on   :', d['alert_title']  # consider not printing this
    print 'news_id    :', d['news_id']  # consider not printing this

    new_data = {}
    new_data['full_article_title'] = ""
    new_data['full_article_text'] = ""
    new_data['full_article_domain'] = ""
    new_data['full_article_publish_date'] = ""
    try:
        with uf.Timeout(5):
            article = g.extract(url=d['url'])
            print 'article.title       :', article.title
            print 'article.domain      :', article.domain
            # print article.cleaned_text
            print 'article.publish_date:', article.publish_date

            new_data['full_article_title'] = article.title
            new_data['full_article_text'] = article.cleaned_text
            new_data['full_article_domain'] = article.domain
            new_data['full_article_publish_date'] = article.publish_date
            # code.interact( local=locals() )
    except uf.Timeout.Timeout:
        print '[ERROR] Timeout. This item (uuid=%s) will be empty' % (
            d['uuid'])
    except:
        print 'py-goose retrieval failed!'

    db.news_data.find_one_and_update({"_id": d['_id']}, {"$set": new_data})
    uf._printer_('Done in %4.2fs' % (time.time() - startTime))
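Both examples wrap the Goose extraction in uf.Timeout(5) and catch uf.Timeout.Timeout. The repository's implementation is not shown; a minimal signal.alarm-based sketch that matches this call pattern (Unix only, a hypothetical stand-in for uf.Timeout) would look like:

import signal

class Timeout(object):
    """Raise Timeout.Timeout if the with-block runs longer than `sec` seconds."""

    class Timeout(Exception):
        pass

    def __init__(self, sec):
        self.sec = sec

    def __enter__(self):
        signal.signal(signal.SIGALRM, self._handler)
        signal.alarm(self.sec)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        signal.alarm(0)    # cancel the pending alarm
        return False       # never swallow exceptions from the block

    def _handler(self, signum, frame):
        raise Timeout.Timeout()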
Example #4
    ob.download_alerts()
    ob.insert_into_db(db)

    #
    # More Alert Sources (future work)
    #   As more sources of alerts are added, they will go here. Each one basically
    #   puts urls into mongodb in db.sun_dance.news_data. Try to keep an interface
    #   similar to google-alerts.
    #

    #
    # URL Loop - loop over all the items in mongodb which do not yet have the full
    #   text of their articles
    #
    uf._printer_G('+++++\n +++++ Start www crawl\n +++++')
    get_news_from_url()
    run_done_in = time.time() - startTimerun
    uf._printer_('Run#%d completed in %4.2f sec on %s' %
                 (run, run_done_in, str(datetime.now())))
    run = run + 1
    sleep_for = repeat_every_sec - run_done_in
    if sleep_for > 0:
        uf._printer_('Sleeping....zZzz for %4.2f sec' % (sleep_for))
        time.sleep(sleep_for)

    #
    # Delete Raw files
    #
    uf._printer_('rm -rf %s' % (storage_folder))
    shutil.rmtree(storage_folder)
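The indented block above is the body of an outer scheduling loop whose opening lines are not part of this excerpt. A minimal sketch of the repeat-every-N-seconds pattern it implements (the interval and the work inside the loop are hypothetical placeholders):

import time
from datetime import datetime

repeat_every_sec = 30 * 60        # hypothetical interval; the real value is defined elsewhere
run = 0
while True:
    startTimerun = time.time()
    # ... download alerts, insert into mongodb, crawl full article text ...
    run_done_in = time.time() - startTimerun
    print 'Run#%d completed in %4.2f sec on %s' % (run, run_done_in, str(datetime.now()))
    run = run + 1
    sleep_for = repeat_every_sec - run_done_in
    if sleep_for > 0:
        print 'Sleeping....zZzz for %4.2f sec' % (sleep_for)
        time.sleep(sleep_for)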
def make_folder_if_not_exist(folder):
    if not os.path.exists(folder):
        print tcol.OKGREEN, 'Make Directory : ', folder, tcol.ENDC
        os.makedirs(folder)
    else:
        print tcol.WARNING, 'Directory already exists : Not creating :', folder, tcol.ENDC


uf._debug('Alerts DB :' + ALERTS_DB)
uf._debug('Open file : ' + CSV_FILENAME)
make_folder_if_not_exist(ALERTS_DB)

csvReader = csv.reader(open(CSV_FILENAME))
for row_i, row in enumerate(csvReader):
    uf._printer_('---' + str(row_i))
    uf._debug('row : ' + str(row), lvl=2)
    tag = row[0]
    url = row[1]

    alert_id = url.strip().split('/')[-1]
    alert_user_id = url.strip().split('/')[-2]

    uf._printer_G('alert_id=%s ; user=%s ; tag=%s' %
                  (alert_id, alert_user_id, tag))

    # Download
    uf._debug('URL:%s' % (url))
    startTime = time.time()
    response = urllib2.urlopen(url)
    html = response.read()
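The urlopen() call above has no timeout and no error handling, so one unreachable feed URL can stall or abort the whole loop. A hedged variant of the same fetch (the timeout value is an arbitrary choice, not from the repository):

import urllib2

try:
    response = urllib2.urlopen(url, timeout=10)
    html = response.read()
except urllib2.URLError as e:
    html = None
    print '[ERROR] could not fetch %s : %s' % (url, str(e))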