Пример #1
0
def _news_scraper(news_site_uid):
    """Fetch every article linked from a news site's home page.

    The site's URL is read from ``configuration()['news_sites']`` under
    ``news_site_uid``. A ``news.HomePage`` is built for it, and each of its
    ``article_links`` is passed to ``_fetch_article``. Truthy results are
    collected; each collected article's title is printed, followed by the
    total number of collected articles.

    Args:
        news_site_uid: Key of the site inside the ``news_sites`` section of
            the configuration.
    """
    host = configuration()['news_sites'][news_site_uid]['url']

    # Log through the same ``logger`` object used further down instead of the
    # root ``logging`` module, so every message of this function goes through
    # one logger. Lazy %-style arguments defer formatting to the logger.
    logger.info('Beginning scraper for %s', host)

    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)

        # _fetch_article may yield a falsy value (e.g. None); skip those.
        if article:
            logger.info('Article fetched!!')
            articles.append(article)
            print(article.title)

    print(len(articles))
Пример #2
0
#!/usr/bin/python
# Frodo - A web app for monitoring SGE cluster status: https://bitbucket.org/yoavram/frodo
# Copyright (c) 2012 by Yoav Ram.
# This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License.
# To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/.
from flask import Flask, render_template, request, session, redirect, url_for, jsonify
import time
import common
import qstat

# Key name for a job id; its use is not visible in this part of the file.
JOB_ID_KEY = 'jobID'

# Project configuration object; the getboolean()/get() calls below suggest a
# ConfigParser-like interface (confirm against common.configuration).
cfg = common.configuration()

# The Flask application; debug mode and the secret key both come from the
# 'web' section of the configuration.
app = Flask(__name__)
app.debug = cfg.getboolean('web','development')
# Flask needs a secret key to sign the session cookie used by the views.
app.secret_key = cfg.get('web','secret')

@app.route('/')
def index():
    """Redirect requests for the site root to the ``qstat_html`` view."""
    qstat_url = url_for('qstat_html')
    return redirect(qstat_url)

@app.route('/qstat')
@app.route('/qstat/jobID/<int:jobID>')
@app.route('/qstat/username/<qusername>')
def qstat_html(jobID = None, qusername=None):
    if 'username' not in session:
        return redirect(url_for('login'))
    username = session['username']
    password = session['password']
    now = time.asctime()
Пример #3
0
#!/usr/bin/python
# Frodo - A web app for monitoring SGE cluster status: https://bitbucket.org/yoavram/frodo
# Copyright (c) 2012 by Yoav Ram.
# This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License.
# To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/.
from flask import Flask, render_template, request, session, redirect, url_for, jsonify
import time
import common
import qstat

# Key name for a job id; its use is not visible in this part of the file.
JOB_ID_KEY = "jobID"

# Project configuration object; the getboolean()/get() calls below suggest a
# ConfigParser-like interface (confirm against common.configuration).
cfg = common.configuration()

# The Flask application; debug mode and the secret key both come from the
# "web" section of the configuration.
app = Flask(__name__)
app.debug = cfg.getboolean("web", "development")
# Flask needs a secret key to sign the session cookie used by the views.
app.secret_key = cfg.get("web", "secret")


@app.route("/")
def index():
    """Redirect requests for the site root to the ``qstat_html`` view."""
    qstat_url = url_for("qstat_html")
    return redirect(qstat_url)


@app.route("/qstat")
@app.route("/qstat/jobID/<int:jobID>")
@app.route("/qstat/username/<qusername>")
def qstat_html(jobID=None, qusername=None):
    if "username" not in session:
        return redirect(url_for("login"))
    username = session["username"]
Пример #4
0
    except (HTTPError, MaxRetryError) as e:
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article


def _build_link(host, link):
    """Turn an article link into a URL based on ``host``.

    A link matched by ``is_well_formed_url`` is returned unchanged. A link
    matched by ``is_root_path`` is appended directly to ``host``. Any other
    link is appended to ``host`` with a ``/`` in between.

    NOTE(review): both patterns are defined outside this block; presumably
    they recognise absolute URLs and paths starting with ``/`` -- confirm.
    """
    if is_well_formed_url.match(link):
        return link

    # Only links not matched by is_root_path get a separating slash.
    separator = '' if is_root_path.match(link) else '/'
    return f'{host}{separator}{link}'


if __name__ == '__main__':
    # Command-line entry point: takes one positional argument naming the news
    # site to scrape and hands it to _news_scraper.
    parser = argparse.ArgumentParser()

    # Only site ids present under 'news_sites' in the configuration are
    # accepted; argparse rejects any other value.
    news_site_choices = list(configuration()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='The news site that you want to scrape',
                        type=str,
                        choices=news_site_choices)

    args = parser.parse_args()
    _news_scraper(args.news_site)
    def __init__(self, news_site_uid, url):
        """Store the site's configuration and visit ``url``.

        Args:
            news_site_uid: Key of the site under ``news_sites`` in the
                configuration.
            url: Address handed to ``self._visit``.
        """
        site_config = configuration()['news_sites'][news_site_uid]
        self._config = site_config
        self._queries = site_config['queries']

        # Start with no HTML; presumably _visit fills this in -- confirm
        # against the _visit implementation.
        self._html = None
        self._visit(url)