Exemplo n.º 1
0
def fetch_max_t_of_prev_trading_day(session: Session, ric: str,
                                    t: datetime) -> int:
    """Return the epoch of the newest price up to the day before ``t`` (JST).

    Args:
        session: Database session used to run the query.
        ric: Instrument code the price rows are restricted to.
        t: Reference instant; its ``tzinfo`` must equal ``UTC``.

    Returns:
        The ``epoch`` extracted from the maximum ``Price.t`` among rows of
        ``ric`` whose JST calendar date is on or before the day preceding
        ``t``. The query yields ``None`` when no row matches.

    Raises:
        AssertionError: If ``t`` is not expressed in UTC.
    """
    assert (t.tzinfo == UTC)

    # Move the UTC instant nine hours forward (JST wall clock), then step
    # back one calendar day to obtain the cut-off date.
    jst_shift = timedelta(hours=9)
    one_day = timedelta(days=1)
    cutoff_date = (t + jst_shift - one_day).date()

    newest_epoch = extract('epoch', func.max(Price.t))
    not_after_cutoff = cast(in_jst(Price.t), Date) <= cutoff_date
    same_instrument = Price.ric == ric

    query = session.query(newest_epoch).filter(not_after_cutoff,
                                               same_instrument)
    return query.scalar()
Exemplo n.º 2
0
def fetch_prices_of_a_day(session: Session, ric: str,
                          jst: datetime) -> List[Tuple[datetime, Decimal]]:
    """Fetch every price of ``ric`` recorded on the JST calendar day of ``jst``.

    Args:
        session: Database session used to run the query.
        ric: Instrument code the price rows are restricted to.
        jst: Datetime whose ``date()`` selects the day (compared against the
            JST-converted ``Price.t``).

    Returns:
        ``(timestamp, value)`` pairs ordered by ``Price.t``. Each timestamp is
        rebuilt from the database's UTC text rendering and tagged with ``UTC``.
    """
    python_format = '%Y-%m-%d %H:%M:%S'
    # The timestamp is rendered to text by the database and parsed back here.
    utc_text = func.to_char(in_utc(Price.t), 'YYYY-MM-DD HH24:MI:SS').label('t')
    on_requested_day = cast(in_jst(Price.t), Date) == jst.date()

    rows = session \
        .query(utc_text, Price.val) \
        .filter(on_requested_day, Price.ric == ric) \
        .order_by(Price.t) \
        .all()

    prices = []
    for row in rows:
        stamp = datetime.strptime(row.t, python_format).replace(tzinfo=UTC)
        prices.append((stamp, row.val))
    return prices
Exemplo n.º 3
0
def _with_default_utc_offset(timestamp: str) -> str:
    """Append ``+0000`` to ``timestamp`` unless it already has ``Z`` or ``+``."""
    # Both markers must be absent before an offset is added; testing them
    # with ``or`` would also rewrite strings that already carry an offset.
    if 'Z' not in timestamp and '+' not in timestamp:
        return timestamp + '+0000'
    return timestamp


def insert_headlines(session: Session, dir_nikkei_headline: Path,
                     train_span: Span, valid_span: Span, test_span: Span,
                     logger: Logger) -> None:
    """Bulk-insert headlines from the CSV files found in a directory.

    Every ``*.csv.gz`` and ``*.csv`` file directly under
    ``dir_nikkei_headline`` is read. A file is skipped when a headline whose
    JST year equals the year of the file's first data row already exists in
    the database; the remaining files are still processed.

    Columns read from each row: ``[1]`` timestamp, ``[5]`` article id,
    ``[6]`` headline, ``[25]`` ISINs, ``[36]`` countries, ``[37]`` categories,
    ``[-2]`` headline keywords and ``[-1]`` article keywords. The list-like
    columns are ``:``-separated and stored as ``None`` when empty.

    Args:
        session: Database session used for the existence check and insert.
        dir_nikkei_headline: Directory containing the headline CSV files.
        train_span: Span (``start``/``end``) labelled as the training phase.
        valid_span: Span labelled as the validation phase.
        test_span: Span labelled as the test phase.
        logger: Receives progress messages and unparsable-timestamp reports.
    """
    dests = list(dir_nikkei_headline.glob('*.csv.gz')) + list(
        dir_nikkei_headline.glob('*.csv'))
    for dest in dests:
        source = gzip.open(str(dest), mode='rt') \
            if dest.suffix == '.gz' \
            else dest.open(mode='r')
        with source as f:

            # Physical line count minus the header; used only as the
            # progress-bar total.
            N = sum(1 for _ in f) - 1
            f.seek(0)
            reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_ALL)
            next(reader, None)  # header
            fields = next(reader, None)
            if fields is None:
                # Empty or header-only file: nothing to import.
                logger.info('skip {} (no data rows)'.format(f.name))
                continue

            # Peek at the first data row to learn which year this file holds.
            t = _with_default_utc_offset(fields[1])
            t = datetime.strptime(t, NIKKEI_DATETIME_FORMAT).astimezone(JST)
            first = session \
                .query(Headline) \
                .filter(extract('year', in_jst(Headline.t)) == t.year) \
                .first()
            if first is not None:
                # Already imported: skip this file only, keep going with the
                # others (glob order is arbitrary).
                continue

            logger.info('start {}'.format(f.name))

            # Rewind and skip the header again before the real pass.
            f.seek(0)
            next(reader)
            headlines = []
            # Iterate the reader itself so a record spanning several physical
            # lines cannot exhaust the reader before the loop ends.
            for fields in tqdm(reader, total=N):
                t = _with_default_utc_offset(fields[1])
                article_id = fields[5]
                headline = fields[6]
                isins = None if fields[25] == '' else fields[25].split(':')
                countries = None if fields[36] == '' else fields[36].split(':')
                categories = None if fields[37] == '' else fields[37].split(
                    ':')
                keywords_headline = None if fields[-2] == '' else fields[
                    -2].split(':')
                keywords_article = None if fields[-1] == '' else fields[
                    -1].split(':')
                try:
                    t = datetime.strptime(t, NIKKEI_DATETIME_FORMAT)
                except ValueError:
                    # Report and drop rows whose timestamp cannot be parsed.
                    message = 'ValueError: {}, {}, {}'
                    logger.info(message.format(t, article_id, headline))
                    continue

                if train_span.start <= t < train_span.end:
                    phase = Phase.Train.value
                elif valid_span.start <= t < valid_span.end:
                    phase = Phase.Valid.value
                elif test_span.start <= t < test_span.end:
                    phase = Phase.Test.value
                else:
                    phase = None

                headlines.append({
                    'article_id': article_id,
                    't': t,
                    'headline': headline,
                    'isins': isins,
                    'countries': countries,
                    'categories': categories,
                    'keywords_headline': keywords_headline,
                    'keywords_article': keywords_article,
                    'is_used': None,
                    'phase': phase
                })

            # Do not hand an empty parameter list to the INSERT; there is
            # nothing to write when every row was rejected.
            if headlines:
                session.execute(Headline.__table__.insert(), headlines)
                session.commit()
Exemplo n.º 4
0
def load_alignments_from_db(session: Session, phase: Phase,
                            logger: Logger) -> List[Alignment]:
    """Build alignments between used headlines of ``phase`` and price charts.

    For each headline (ordered by time) the latest values of every
    (RIC, sequence type) combination are fetched, and each
    ``<yen val="..."/>`` token is replaced by the result of
    ``find_operation`` against the N225 raw short/long series. When either
    series is empty the token becomes ``<yen val="z"/>``.

    Args:
        session: Database session used for all queries.
        phase: Phase whose ``value`` selects the headlines.
        logger: Receives start/end progress messages.

    Returns:
        A list with one ``Alignment.to_dict()`` result per headline.
    """
    unixtime_col = cast(extract('epoch', Headline.t), Integer).label('unixtime')
    jst_hour_col = cast(extract('hour', in_jst(Headline.t)),
                        Integer).label('jst_hour')
    headlines = session \
        .query(Headline.article_id,
               Headline.tag_tokens,
               Headline.t,
               unixtime_col,
               jst_hour_col) \
        .filter(Headline.is_used.is_(True), Headline.phase == phase.value) \
        .order_by(Headline.t) \
        .all()

    rics = fetch_rics(session)

    seqtypes = [
        SeqType.RawShort, SeqType.RawLong, SeqType.MovRefShort,
        SeqType.MovRefLong, SeqType.NormMovRefShort, SeqType.NormMovRefLong,
        SeqType.StdShort, SeqType.StdLong
    ]
    logger.info(
        'start creating alignments between headlines and price sequences.')

    alignments = []
    for h in tqdm(headlines):

        # Latest values before publication, one entry per (ric, seqtype).
        chart = dict(
            fetch_latest_vals(session, h.t, ric, seqtype)
            for (ric, seqtype) in itertools.product(rics, seqtypes))

        short_term_vals = chart[stringify_ric_seqtype(Code.N225.value,
                                                      SeqType.RawShort)]
        long_term_vals = chart[stringify_ric_seqtype(Code.N225.value,
                                                     SeqType.RawLong)]

        # Swap every yen tag for a price tag; other tokens pass through.
        processed_tokens = []
        for token in h.tag_tokens:
            is_yen_tag = token.startswith('<yen val="') \
                and token.endswith('"/>')
            if not is_yen_tag:
                processed_tokens.append(token)
                continue

            ref = fromstring(token).attrib['val']
            if len(short_term_vals) > 0 and len(long_term_vals) > 0:
                prev_trading_day_close = Decimal(long_term_vals[0])
                latest = Decimal(short_term_vals[0])
                processed_tokens.append(
                    find_operation(ref, prev_trading_day_close, latest))
            else:
                # No price data available for this headline.
                processed_tokens.append('<yen val="z"/>')

        alignment = Alignment(h.article_id, str(h.t), h.jst_hour,
                              processed_tokens, chart)
        alignments.append(alignment.to_dict())

    logger.info(
        'end creating alignments between headlines and price sequences.')
    return alignments
Exemplo n.º 5
0
def _none_if_blank(value):
    """Return ``None`` for a missing or whitespace-only form value."""
    return None if value is None or value.strip() == '' else value


def article_evaluation(article_id: str, method: str,
                       is_debug: bool) -> flask.Response:
    """Store (POST) or display (any other method) one article's evaluation.

    POST: copies ``note``, ``fluency`` and ``informativeness`` from the
    submitted form onto the article's ``HumanEvaluation`` row (blank values
    become ``None``), stores the per-method correctness for ``Base``,
    ``Gold`` and ``Ours``, commits, and redirects to the form's
    ``referrer`` (default ``/``). The form field for a method is
    ``correctness-<n>`` where ``n`` is the method's 1-based position in the
    stored ordering. Each ``.one()`` lookup raises unless exactly one row
    matches.

    Otherwise: renders ``human_evaluation.pug`` with the headline, the price
    tables in groups of three, and the generation results available for the
    article.

    Args:
        article_id: Identifier of the article being evaluated.
        method: HTTP method of the current request.
        is_debug: Selects the debug title and method ordering.

    Returns:
        A redirect response (POST) or the rendered evaluation page.
    """
    if method == 'POST':

        h = db \
            .session \
            .query(HumanEvaluation) \
            .filter(HumanEvaluation.article_id == article_id) \
            .one()

        form = flask.request.form
        # 1-based position of each method as shown to the evaluator.
        nth = {method_name: i + 1
               for (i, method_name) in enumerate(h.ordering)}

        h.note = _none_if_blank(form.get('note'))
        h.fluency = _none_if_blank(form.get('fluency'))
        h.informativeness = _none_if_blank(form.get('informativeness'))

        for method_name in ('Base', 'Gold', 'Ours'):
            result = db \
                .session \
                .query(GenerationResult) \
                .filter(GenerationResult.article_id == article_id,
                        GenerationResult.method_name == method_name) \
                .one()
            result.correctness = form.get(
                'correctness-{}'.format(nth[method_name]))

        db.session.commit()

        # NOTE(review): 'referrer' comes straight from the submitted form and
        # is used as the redirect target without validation (open redirect).
        # Consider restricting it to same-site paths.
        referrer = flask.request.form.get('referrer', '/')
        return flask.redirect(referrer)

    headline = db \
        .session \
        .query(Headline.article_id,
               Headline.simple_headline.label('gold_result'),
               Headline.t,
               func.to_char(in_jst(Headline.t), 'YYYY-MM-DD HH24:MI:SS').label('s_jst')) \
        .filter(Headline.article_id == article_id) \
        .one()

    ric_tables = create_ric_tables(db.session, config.rics,
                                   ric_to_ric_info, headline.t)
    group_size = 3
    # Pad with dummy tables so every group is complete.
    while len(ric_tables) % group_size != 0:
        ric_tables.append(Table('', '', '', [], is_dummy=True))
    ric_table_groups = [
        ric_tables[i:i + group_size]
        for i in range(0, len(ric_tables), group_size)
    ]
    # It is better to share one procedure with the search,
    # but we keep this procedure for convenience
    target = db \
        .session \
        .query(HumanEvaluation) \
        .filter(HumanEvaluation.article_id == article_id) \
        .one_or_none()

    # one_or_none() may return None; fall back to an empty evaluation instead
    # of failing on attribute access.
    if target is None:
        ordering = None
        note = fluency = informativeness = ''
    else:
        ordering = target.ordering
        note = '' if target.note is None else target.note
        fluency = '' if target.fluency is None else target.fluency
        informativeness = '' \
            if target.informativeness is None \
            else target.informativeness

    method_names = ['Gold'] if ordering is None else ordering
    if is_debug:
        method_names = order_method_names_for_debug(method_names)

    # Only methods that have a stored generation result are displayed.
    found = []
    for method_name in method_names:
        res = db \
            .session \
            .query(GenerationResult.article_id,
                   GenerationResult.result,
                   GenerationResult.correctness) \
            .filter(GenerationResult.article_id == article_id,
                    GenerationResult.method_name == method_name) \
            .one_or_none()

        if res is not None:
            text = headline.gold_result \
                if method_name == 'Gold' \
                else res.result
            found.append(EvalTarget(method_name, text, is_debug))

    targets = [(i + 1, eval_target)
               for (i, eval_target) in enumerate(found)]

    return flask.render_template(
        'human_evaluation.pug',
        title='debug' if is_debug else 'human-evaluation',
        article_id=headline.article_id,
        timestamp=headline.s_jst + ' JST',
        targets=targets,
        fluency=fluency,
        informativeness=informativeness,
        note=note,
        ric_table_groups=ric_table_groups)
Exemplo n.º 6
0
def list_targets_of_human_evaluation(is_debug: bool) -> flask.Response:
    """Render one page of the human-evaluation target list.

    Query-string parameters read from the request:
        ``page``: 1-based page number; missing, non-numeric or non-positive
            values fall back to the first page.
        ``field<i>``, ``rel<i>``, ``val<i>`` for ``i`` in 0..4: optional
            filter triples, each turned into a constraint by
            ``construct_constraint_query`` when all three parts are present.

    Args:
        is_debug: Selects the debug title and method ordering.

    Returns:
        The rendered ``list_human_evaluation.pug`` page.
    """
    args = flask.request.args
    # type=int makes an unparsable value fall back to the default; clamping
    # avoids a negative OFFSET for page <= 0.
    page = max(1, args.get('page', default=1, type=int))
    conditions = []
    for i in range(5):
        field = args.get('field' + str(i))
        relation = args.get('rel' + str(i))
        val = args.get('val' + str(i))
        if field is not None and relation is not None and val is not None:
            constraint = construct_constraint_query(field.strip(),
                                                    relation.strip(),
                                                    val.strip())
            conditions.append(constraint)

    q = db \
        .session \
        .query(HumanEvaluation.article_id,
               HumanEvaluation.ordering,
               HumanEvaluation.is_target,
               (func.coalesce(HumanEvaluation.fluency, '')).label('fluency'),
               (func.coalesce(HumanEvaluation.informativeness, '')).label('informativeness'),
               (func.coalesce(HumanEvaluation.note, '')).label('note'),
               Headline.simple_headline.label('gold_result'),
               Headline.phase,
               func.to_char(in_jst(Headline.t), 'YYYY-MM-DD HH24:MI').label('jst')) \
        .outerjoin(Headline,
                   HumanEvaluation.article_id == Headline.article_id) \
        .filter(Headline.is_used.is_(True), *conditions) \
        .order_by(Headline.t)

    n_results = q.count()
    per_page = config.n_items_per_page
    n_expected_methods = len(list(config.result.keys()) + ['Gold'])
    articles = []
    for h in q.limit(per_page).offset((page - 1) * per_page).all():
        method_names = ['Gold'] if h.ordering is None else h.ordering
        if is_debug:
            method_names = order_method_names_for_debug(method_names)

        eval_targets = []
        for method_name in method_names:
            res = db \
                .session \
                .query(GenerationResult.article_id,
                       GenerationResult.result,
                       GenerationResult.correctness) \
                .filter(GenerationResult.article_id == h.article_id,
                        GenerationResult.method_name == method_name) \
                .one_or_none()

            if res is None:
                # NOTE(review): this placeholder is built but never appended,
                # so methods without a stored result are not listed. Confirm
                # whether the append below was meant to cover this branch.
                et = EvalTarget(method_name, h.gold_result, None) \
                     if method_name == 'Gold' \
                     else EvalTarget(method_name, '', None)
            else:
                text = h.gold_result \
                    if method_name == 'Gold' \
                    else res.result
                et = EvalTarget(method_name, text, is_debug)
                eval_targets.append(et)

        # NOTE(review): spelled-out form of the original chained comparison
        # `n == len(h.fluency) > 0 and ...`. Comparing the number of methods
        # with the length of the fluency text looks unintended; behaviour is
        # kept as-is until the intended rule is confirmed.
        is_finished = (n_expected_methods == len(h.fluency)
                       and len(h.fluency) > 0
                       and len(h.informativeness) > 0)

        e = EvalListRow(h.article_id, h.jst, h.phase, eval_targets, h.fluency,
                        h.informativeness, h.note, h.is_target, is_finished)
        articles.append(e)

    if n_results == 0:
        display_msg = 'No headline is found'
    else:
        offset = (page - 1) * per_page + 1
        # Last item actually shown on this page (the final page may be short).
        end = min(offset + per_page - 1, n_results)
        display_msg = 'Displaying {:,} to {:,} of {:,}'.format(
            offset, end, n_results)

    # A next page exists exactly when items remain beyond this page; floor
    # division would hide a trailing partial page.
    pagination = DummyPagination(has_prev=page > 1,
                                 has_next=page * per_page < n_results,
                                 display_msg=display_msg)

    return flask.render_template(
        'list_human_evaluation.pug',
        title='debug' if is_debug else 'human-evaluation',
        condition=conditions,
        articles=articles,
        pagination=pagination)
Exemplo n.º 7
0
def fetch_close(session: Session, ric: str, jst: datetime) -> float:
    """Look up the price value at the close of ``ric`` on the day of ``jst``.

    The ``Close`` row is selected by instrument and by the JST calendar date
    of ``Close.t``; the value comes from the ``Price`` row of the same
    instrument with the same timestamp.

    Args:
        session: Database session used to run the query.
        ric: Instrument code.
        jst: Datetime whose ``date()`` selects the day.

    Returns:
        The value converted to ``float``, or ``None`` when the query yields
        no value.
    """
    criteria = (
        cast(in_jst(Close.t), Date) == jst.date(),
        Close.ric == ric,
        Close.t == Price.t,
        Price.ric == ric,
    )
    value = session.query(Price.val).filter(*criteria).scalar()
    if value is None:
        return None
    return float(value)