Exemplo n.º 1
0
    def work(self):
        """
        Create an Analysis in Status.NEW for Gkg rows that have none yet.

        Selects up to 1000 Gkg rows, ordered by ``Gkg.date``, for which no
        Analysis row exists, locking them FOR UPDATE, and adds one Analysis
        per row. The whole batch is committed in a single transaction so the
        row locks are held until every Analysis of the batch has been written.

        Returns True iff some Analyses were created.
        """
        # start a new session for each job
        session = Session()
        try:
            # Get a batch of Gkg rows
            # ... for which no Analysis exists
            # ... and lock them for updates
            # ... sort by date
            # ... pick the first 1000
            gkgs = session.query(Gkg) \
                .filter(~session.query(Analysis).filter(Gkg.id == Analysis.gkg_id).exists()) \
                .with_for_update() \
                .order_by(Gkg.date) \
                .limit(1000).all()
            if not gkgs:
                return False  # no work to be done

            analyses = [Analysis(gkg=gkg, status=Status.NEW) for gkg in gkgs]
            session.add_all(analyses)
            # Commit once for the whole batch: committing per row would end
            # the transaction, and with it the FOR UPDATE locks, after the
            # first row while the rest of the batch is still unprocessed.
            session.commit()

            pid = os.getpid()
            for analysis in analyses:
                logger.info(
                    "Worker {} created Analysis {} in status {}".format(
                        pid, analysis.gkg_id, analysis.status))
        finally:
            # make sure to release a FOR UPDATE lock, if we got one
            session.rollback()
            session.close()

        return True
Exemplo n.º 2
0
    def work(self):
        """
        Claim one Analysis and run ``self.function`` on it.

        Phase 1 (claim): apply ``self.filter_function`` to a query over
        Analysis, lock the match with FOR UPDATE, take the one with the
        smallest ``Analysis.updated`` and move it to ``self.working_status``.

        Phase 2 (process): call ``self.function(analysis)`` under a
        ``signal.alarm`` timeout of ``self.timeout_seconds``, then record the
        processing time and move the Analysis to ``self.success_status``, or
        to ``self.failure_status`` (storing the error message) if an
        exception was raised.

        The session is closed on every exit path.

        Returns True iff an Analysis was processed (successfully or not),
        False if there was nothing to claim.
        """
        # start a new session for each job
        session = Session()
        claimed = False
        try:
            # Get an analysis
            # ... that meets the conditions specified in the filter function
            # ... and lock it for updates
            # ... sort by updated date
            # ... pick the first (oldest)
            analysis = self.filter_function(session.query(Analysis)) \
                .with_for_update() \
                .order_by(Analysis.updated) \
                .first()
            if analysis is None:
                return False  # no work to be done
            analysis_status = analysis.status
            # NOTE(review): the session is rolled back right below, so
            # create_new_version presumably persists the status change
            # itself - confirm against its implementation.
            analysis.create_new_version(self.working_status)
            logger.info("Worker {} claimed Analysis {} in status {}".format(
                os.getpid(), analysis.gkg_id, analysis_status))
            claimed = True
        finally:
            # make sure to release a FOR UPDATE lock, if we got one
            session.rollback()
            if not claimed:
                # phase 2 will not run, so this is the last chance to close
                session.close()

        start = time.time()
        try:
            # set a timeout so if this worker stalls, we recover
            # (presumably a SIGALRM handler installed elsewhere raises)
            signal.alarm(self.timeout_seconds)
            # actually run the work function on this analysis
            self.function(analysis)
            delta = time.time() - start
            logger.info("Worker {} processed Analysis {} {} -> {} {}s".format(
                os.getpid(), analysis.gkg_id, analysis_status,
                self.success_status, delta))
            analysis.error_msg = None
            analysis.processing_time = delta
            analysis.create_new_version(self.success_status)
            # commit explicitly, as the failure path does, so the result
            # cannot be discarded by the rollback in the finally clause
            session.commit()
        except Exception as e:
            delta = time.time() - start
            logger.warning(
                "Worker {} failed to process Analysis {} {} -> {}".format(
                    os.getpid(), analysis.gkg_id, analysis_status,
                    self.failure_status),
                exc_info=e)
            analysis.error_msg = str(e)
            analysis.processing_time = delta
            analysis.create_new_version(self.failure_status)
            session.commit()
        finally:
            # clear the timeout
            signal.alarm(0)
            session.rollback()
            session.close()
        return True
Exemplo n.º 3
0
def map_week_mview():
    """Return the result of ``get_map_week`` as a JSON response (status 200)."""
    db = Session()
    try:
        response = jsonify(get_map_week(db))
        response.status_code = 200
    finally:
        # release the session whether or not the lookup succeeded
        db.close()
    return response
Exemplo n.º 4
0
def wordcloud():
    """
    Return the result of ``get_wordcloud`` as a JSON response (status 200).

    Filter parameters are read from the JSON request body when one is
    present, otherwise from the submitted form.
    """
    db = Session()
    try:
        payload = request.get_json(silent=True)
        if not payload:
            # no (or empty) JSON body: fall back to form fields
            payload = request.form
        criteria = filter_params(payload)
        cloud = get_wordcloud(db, engine, **criteria)
        response = jsonify(cloud)
        response.status_code = 200
    finally:
        db.close()
    return response
Exemplo n.º 5
0
def histogram():
    """
    Return the result of ``get_histogram_counts`` as a JSON response (status 200).

    Filter parameters are read from the JSON request body when one is
    present, otherwise from the submitted form.
    """
    db = Session()
    try:
        payload = request.get_json(silent=True)
        if not payload:
            # no (or empty) JSON body: fall back to form fields
            payload = request.form
        criteria = filter_params(payload)
        counts = get_histogram_counts(db, **criteria)
        response = jsonify(counts)
        response.status_code = 200
    finally:
        db.close()
    return response
Exemplo n.º 6
0
def homepage():
    """
    Render ``index.html`` with the ten most recently updated Analyses plus
    the status and category counts reported by ``Analysis``.
    """
    db = Session()
    try:
        newest_first = db.query(Analysis).order_by(desc(Analysis.updated))
        context = {'articles': newest_first.limit(10).all()}
        context['counts'] = Analysis.status_counts(db)
        context['cat_counts'] = Analysis.category_counts(db)
        # render while the session is still open
        page = render_template('index.html', **context)
    finally:
        db.close()
    return page
Exemplo n.º 7
0
def urllist():
    """
    Return one page of ``get_urllist`` results plus the total count as JSON.

    Filters, ``limit`` (default 32) and ``offset`` (default 0) are read from
    the JSON request body when one is present, otherwise from the form. The
    response has the keys ``entries`` and ``nentries``.
    """
    db = Session()
    try:
        payload = request.get_json(silent=True)
        if not payload:
            payload = request.form
        criteria = filter_params(payload)
        page_size = payload.get('limit', 32)
        start_at = payload.get('offset', 0)
        body = {
            'entries': get_urllist(db, limit=page_size, offset=start_at,
                                   **criteria),
            'nentries': get_count(db, **criteria),
        }
        response = jsonify(body)
        response.status_code = 200
    finally:
        db.close()
    return response
Exemplo n.º 8
0
def article(doc_id):
    """
    Render ``article.html`` for the Analysis whose ``gkg_id`` equals *doc_id*.

    The template also receives ``coords``: the distinct ``latlong`` values of
    all locations of all facts of the Analysis, each split on "," into a
    tuple. Locations without a ``latlong`` are skipped.
    """
    db = Session()
    try:
        record = db.query(Analysis).filter(Analysis.gkg_id == doc_id).one()
        unique_coords = set()
        for fact in record.facts:
            for location in fact.locations:
                if location.latlong is None:
                    continue
                unique_coords.add(tuple(location.latlong.split(",")))
        # render while the session is still open
        page = render_template('article.html',
                               article=record,
                               coords=list(unique_coords))
    finally:
        db.close()
    return page
Exemplo n.º 9
0
def add_url():
    """
    Store the submitted ``url`` form field as a new Gkg row.

    Flashes a success message and redirects to '/' once the row has been
    committed. When the field is missing or empty, flashes an error message
    and redirects to '/' without touching the database.
    """
    # .get() instead of [...]: indexing raises on a missing key, which made
    # the guard below unreachable.
    url = request.form.get('url')
    if not url:
        flash(u'Something went wrong. Please try again.', 'danger')
        # url_for() expects an endpoint name, not a path; redirect to the
        # same path the success branch uses.
        return redirect('/')
    logger.info("Scraping by url: {url}".format(url=url))
    gkg = Gkg(document_identifier=url)
    session = Session()
    try:
        session.add(gkg)
        session.commit()
        flash(u"{} was successfully added".format(url), 'success')
        return redirect('/')
    finally:
        session.close()
Exemplo n.º 10
0
def search_url():
    """
    Look up the Gkg row with the latest ``date`` whose
    ``document_identifier`` contains the ``url`` query argument.

    Returns a JSON response ``{'doc_id': <id>}`` with status 200 on a match.
    Returns ``{'success': false}`` with status 422 when the argument is
    missing or empty, or when nothing matches.
    """
    # Built once; 'Content-Type' is the actual header name, so the body is
    # served as JSON.
    failure = (json.dumps({'success': False}), 422,
               {'Content-Type': 'application/json'})

    url = request.args.get('url')
    if not url:
        # an empty string would turn into the pattern '%%', matching any row
        return failure

    session = Session()
    try:
        # NOTE: '%' and '_' inside url still act as LIKE wildcards.
        gkg = session.query(Gkg).filter(
            Gkg.document_identifier.like("%" + url + "%")).order_by(
                Gkg.date.desc()).first()
        if not gkg:
            return failure
        resp = jsonify({'doc_id': gkg.id})
        resp.status_code = 200
        return resp
    finally:
        session.close()
Exemplo n.º 11
0
def urllist_grouped():
    """
    Return one page of ``get_urllist_grouped`` results plus totals as JSON.

    Filters, ``limit`` (default 32) and ``offset`` (default 0) are read from
    the JSON request body when one is present, otherwise from the form. The
    response has the keys ``groups``, ``ngroups`` and ``tot_nfacts``.
    """
    db = Session()
    try:
        payload = request.get_json(silent=True)
        if not payload:
            payload = request.form
        criteria = filter_params(payload)
        page_size = payload.get('limit', 32)
        start_at = payload.get('offset', 0)
        groups = get_urllist_grouped(db, limit=page_size, offset=start_at,
                                     **criteria)
        # report both totals: number of facts and number of groups
        total_facts = get_count(db, **criteria)
        total_groups = get_group_count(db, **criteria)
        body = {
            'groups': groups,
            'ngroups': total_groups,
            'tot_nfacts': total_facts,
        }
        response = jsonify(body)
        response.status_code = 200
    finally:
        db.close()
    return response
Exemplo n.º 12
0
import string
import numpy as np
import pandas as pd
from idetect.nlp_models.category import *
from idetect.nlp_models.relevance import *
from idetect.nlp_models.base_model import CustomSklLsiModel

if __name__ == "__main__":
    # One-off setup: create the schema, seed the reference tables when they
    # are empty, and instantiate the classifier models.

    # Create the Database
    engine = create_engine(db_url())
    Session.configure(bind=engine)
    Base.metadata.create_all(engine)

    session = Session()
    try:
        # Load the Countries data if necessary; fetching a single row is
        # enough to tell whether the table is empty.
        if session.query(Country).first() is None:
            load_countries(session)

        # Load the Keywords if necessary
        if session.query(FactKeyword).first() is None:
            load_terms(session)
    finally:
        # close the session even if one of the loaders fails
        session.close()

    # Load the Classifier models once to ensure they are downloaded
    CategoryModel()
    RelevanceModel()
Exemplo n.º 13
0
def analyse_url():
    """
    Return the document and facts for a URL, analysing it first if needed.

    The ``url`` parameter is read from the JSON request body when one is
    present, otherwise from the submitted form. If a Gkg row whose
    ``document_identifier`` contains the URL already exists, its stored data
    is returned. Otherwise a new analysis is created and run through the
    scraping, fact extraction and geotagging steps before its data is
    returned.

    Responses:
      * 200 with ``document``, ``facts`` and ``status`` on success;
      * 422 with ``success: false`` when the URL is missing/empty or a
        processing step raises (``Exception`` carries the error text).
    """
    json_headers = {'Content-Type': 'application/json'}

    try:
        # silent=True gives None for a non-JSON body; fall back to the form
        # before indexing so form submissions are actually honoured.
        payload = request.get_json(silent=True) or request.form
        url = payload['url']
    except (KeyError, TypeError) as e:
        return json.dumps({
            'success': False,
            'Exception': str(e),
            'status': 'missing or null url parameter'
        }), 422, json_headers
    if not url:
        # an empty string would turn into the LIKE pattern '%%' below
        return json.dumps({
            'success': False,
            'status': 'null url parameter'
        }), 422, json_headers

    # Open the session only after validation so the early returns above
    # cannot leak it; the finally clause covers every path from here on.
    session = Session()
    try:
        gkg = session.query(Gkg.id).filter(
            Gkg.document_identifier.like("%" + url + "%")).order_by(
                Gkg.date.asc()).first()
        if gkg:
            gkg_id = gkg.id
            status = 'url already in IDETECT DB'
        else:
            analysis = create_new_analysis_from_url(session, url)
            gkg_id = analysis.gkg_id
            status = 'url added to IDETECT DB'
            try:
                work(session, analysis, Status.SCRAPING, Status.SCRAPED,
                     Status.SCRAPING_FAILED, scrape)
                # TODO add classification, missing modules
                # work(session,analysis,Status.CLASSIFYING,Status.CLASSIFIED,Status.CLASSIFYING_FAILED,lambda article: classify(article, get_c_m(), get_r_m()))
                work(session, analysis, Status.EXTRACTING, Status.EXTRACTED,
                     Status.EXTRACTING_FAILED, extract_facts)
                work(session, analysis, Status.GEOTAGGING, Status.GEOTAGGED,
                     Status.GEOTAGGING_FAILED, process_locations)
            except Exception as e:
                return json.dumps({
                    'success': False,
                    'Exception': str(e)
                }), 422, json_headers
            # End the pipeline's transaction so the reads below only see
            # committed state.
            session.rollback()

        document = get_document(session, gkg_id)
        entries = get_facts_for_document(session, gkg_id)
        resp = jsonify({
            'document': document,
            'facts': entries,
            'status': status
        })
        resp.status_code = 200
        return resp
    finally:
        session.close()