Пример #1
0
def batch_detect_language(n_reviews):
    src_sql = \
        """
        select id, review
        from integration.review
        where
        id >= %(next_id)s
        and review_lang IS NULL
        and review IS NOT NULL
        order by 1
        limit %(nb)s;
        """
    upd_sql = \
        """
        update integration.review as t set review_lang = s.lang
        from (values {values_src}
        ) as s(id, lang)
        where s.id = t.id;
        """
    global next_rev_id
    step_size = 1000

    for i in xrange(0, n_reviews, step_size):
        rows_src = elt.get_ro_connection().fetch_all(src_sql, params={'next_id': next_rev_id, 'nb': step_size}, in_trans=True)
        tgt_list = [(rev_t[0], detect_text_language(rev_t[1])) for rev_t in rows_src]

        values_src = str(tgt_list).strip("[]").replace("u'", "'").replace("L,", ",")
        update_sql = upd_sql.format(values_src=values_src)
        rowcount = elt .get_connection().execute_inTransaction(update_sql)

        next_rev_id = rows_src[-1][0]
        logger.info("Batch detect language processed %d reviews (last rowcount= %d, last id= %d )" % (i, rowcount, next_rev_id))
Пример #2
0
def fetch_workIds_no_info(nb_work):
    sql = \
        """
        select w.refid
        from integration.work w
        left join integration.work_info wi on (w.refid = wi.work_refid)
        left join integration.work_sameas s on (w.refid = s.work_refid)
        where wi.work_refid IS NULL
        and s.work_refid IS NULL
        limit %(nb)s
        """
    return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work}, as_dict=True)
Пример #3
0
 def run(self):
     # order by work_refid to align with review harvesting execution
     sql = \
         """
         select w.ean::text
         from integration.work_isbn w
         join integration.work_info wi on (w.work_refid = wi.work_refid)
         left join integration.isbn_info i on (w.ean = i.ean)
         where i.lang_code IS NULL
         order by wi.popularity
         --order by w.work_refid
         limit %(nb)s
         """
     conn = elt.get_ro_connection()
     tup_list = conn.fetch_all(sql, params={'nb': self.n_isbn})
     f = self.output().open('w')
     f.write(";".join([tup[0] for tup in tup_list]))
     f.close()
Пример #4
0
def fetch_ltwids_stat_harvested(nb_work):
    sql = \
        """
        with cnt_per_lang as (
            select r.work_refid as work_refid, r.review_lang, count(1)
            from integration.review r
            join integration.site s on (s.id = r.site_id and s.logical_name = 'librarything')
            group by 1,2
        )
        select w.refid, cast(w.last_harvest_dts as date) as last_harvest_date,
                array_agg(review_lang) as langs, array_agg(cnt) as cnts
        from work w
        left join cnt_per_lang c on (w.refid = c.work_refid)
        where w.last_harvest_dts IS NOT NULL
        group by 1,2
        order by 2 asc
        limit %(nb)s
        """
    return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work}, as_dict=True)
Пример #5
0
def fetch_workIds_not_harvested(site_logical_name, nb_work, lang_code='eng', orderby_pop=False):
    """
    Fetch work_refid/isbns not yet harvested (mapping not present)
    using a source work_info and isbn_info (for lang) and, optionally order by popularity (more popular first)

    For lt, return ids for work that had their reference harvested.

    :param nb_work: number of work-ids to fetch
    :param lang_code: filter isbn on lang_code (ex. to limit isbn for lang-specific site).
    (not used for lt)
    :return:
    """
    # These two are just temp and should be removed...
    sql_temp_harvested_in_babelio_not_lt = \
        """
        select m.work_refid
        from integration.work_site_mapping m
        join integration.work w on (w.refid = m.work_refid and site_id = 4 and w.last_harvest_dts is null)
        order by {order_by}
        limit %(nb)s
        """
    sql_temp_harvested_in_babelio_not_gr = \
        """
        select ba.work_refid, array_agg(wi.ean) as isbns
        from integration.work_site_mapping ba
        left join integration.work_site_mapping gr on (ba.work_refid = gr.work_refid and ba.site_id = 4 and gr.site_id = 2)
        left join integration.work_sameas s on (ba.work_refid = s.work_refid)
        join integration.work_isbn wi on (ba.work_refid = wi.work_refid)
        where
        ba.site_id = 4
        and gr.work_refid is null
        and s.work_refid is null
        group by 1
        limit %(nb)s
        """

    sql_other = \
        """
        with ref as (
            select  coalesce(same.master_refid, w.work_refid) as wid
                    , w.popularity
                    , array_agg(wi.ean) as isbn_list
            from integration.work_info w
            left join integration.work_sameas same on (w.work_refid = same.work_refid)
            join integration.work_isbn wi on (wi.work_refid = w.work_refid and wi.deletion_date IS NULL )
            join integration.isbn_info ii on (wi.ean = ii.ean and ii.lang_code = %(lang)s)
            group by 1,2
        )
        select ref.wid as work_refid, ref.isbn_list as isbns
        from ref
        left join
            (select work_refid, last_harvest_dts
             from integration.work_site_mapping m
             join integration.site s on (m.site_id = s.id and s.logical_name = %(name)s)
            ) as mapped on (mapped.work_refid = ref.wid)
        where mapped.last_harvest_dts IS NULL
        order by {order_by}
        limit %(nb)s
        """
    # select only ones with work_info harvested
    sql_lt = \
        """
        select wi.work_refid
        from integration.work_info wi
        inner join integration.work w on (wi.work_refid = w.refid)
        where
        w.last_harvest_dts IS NULL
        order by {order_by}
        limit %(nb)s
        """

    if orderby_pop:
        order_by = 'popularity'
    else:
        order_by = '1'

    if site_logical_name == 'librarything':
        sql = sql_temp_harvested_in_babelio_not_lt.format(order_by=order_by)
        return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work}, as_dict=True)
    else:
        sql = sql_temp_harvested_in_babelio_not_gr.format(order_by=order_by)
        return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work,
                                                       'name': site_logical_name,
                                                       'lang': lang_code}, as_dict=True)