def batch_detect_language(n_reviews): src_sql = \ """ select id, review from integration.review where id >= %(next_id)s and review_lang IS NULL and review IS NOT NULL order by 1 limit %(nb)s; """ upd_sql = \ """ update integration.review as t set review_lang = s.lang from (values {values_src} ) as s(id, lang) where s.id = t.id; """ global next_rev_id step_size = 1000 for i in xrange(0, n_reviews, step_size): rows_src = elt.get_ro_connection().fetch_all(src_sql, params={'next_id': next_rev_id, 'nb': step_size}, in_trans=True) tgt_list = [(rev_t[0], detect_text_language(rev_t[1])) for rev_t in rows_src] values_src = str(tgt_list).strip("[]").replace("u'", "'").replace("L,", ",") update_sql = upd_sql.format(values_src=values_src) rowcount = elt .get_connection().execute_inTransaction(update_sql) next_rev_id = rows_src[-1][0] logger.info("Batch detect language processed %d reviews (last rowcount= %d, last id= %d )" % (i, rowcount, next_rev_id))
def fetch_workIds_no_info(nb_work): sql = \ """ select w.refid from integration.work w left join integration.work_info wi on (w.refid = wi.work_refid) left join integration.work_sameas s on (w.refid = s.work_refid) where wi.work_refid IS NULL and s.work_refid IS NULL limit %(nb)s """ return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work}, as_dict=True)
def run(self): # order by work_refid to align with review harvesting execution sql = \ """ select w.ean::text from integration.work_isbn w join integration.work_info wi on (w.work_refid = wi.work_refid) left join integration.isbn_info i on (w.ean = i.ean) where i.lang_code IS NULL order by wi.popularity --order by w.work_refid limit %(nb)s """ conn = elt.get_ro_connection() tup_list = conn.fetch_all(sql, params={'nb': self.n_isbn}) f = self.output().open('w') f.write(";".join([tup[0] for tup in tup_list])) f.close()
def fetch_ltwids_stat_harvested(nb_work): sql = \ """ with cnt_per_lang as ( select r.work_refid as work_refid, r.review_lang, count(1) from integration.review r join integration.site s on (s.id = r.site_id and s.logical_name = 'librarything') group by 1,2 ) select w.refid, cast(w.last_harvest_dts as date) as last_harvest_date, array_agg(review_lang) as langs, array_agg(cnt) as cnts from work w left join cnt_per_lang c on (w.refid = c.work_refid) where w.last_harvest_dts IS NOT NULL group by 1,2 order by 2 asc limit %(nb)s """ return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work}, as_dict=True)
def fetch_workIds_not_harvested(site_logical_name, nb_work, lang_code='eng', orderby_pop=False): """ Fetch work_refid/isbns not yet harvested (mapping not present) using a source work_info and isbn_info (for lang) and, optionally order by popularity (more popular first) For lt, return ids for work that had their reference harvested. :param nb_work: number of work-ids to fetch :param lang_code: filter isbn on lang_code (ex. to limit isbn for lang-specific site). (not used for lt) :return: """ # These two are just temp and should be removed... sql_temp_harvested_in_babelio_not_lt = \ """ select m.work_refid from integration.work_site_mapping m join integration.work w on (w.refid = m.work_refid and site_id = 4 and w.last_harvest_dts is null) order by {order_by} limit %(nb)s """ sql_temp_harvested_in_babelio_not_gr = \ """ select ba.work_refid, array_agg(wi.ean) as isbns from integration.work_site_mapping ba left join integration.work_site_mapping gr on (ba.work_refid = gr.work_refid and ba.site_id = 4 and gr.site_id = 2) left join integration.work_sameas s on (ba.work_refid = s.work_refid) join integration.work_isbn wi on (ba.work_refid = wi.work_refid) where ba.site_id = 4 and gr.work_refid is null and s.work_refid is null group by 1 limit %(nb)s """ sql_other = \ """ with ref as ( select coalesce(same.master_refid, w.work_refid) as wid , w.popularity , array_agg(wi.ean) as isbn_list from integration.work_info w left join integration.work_sameas same on (w.work_refid = same.work_refid) join integration.work_isbn wi on (wi.work_refid = w.work_refid and wi.deletion_date IS NULL ) join integration.isbn_info ii on (wi.ean = ii.ean and ii.lang_code = %(lang)s) group by 1,2 ) select ref.wid as work_refid, ref.isbn_list as isbns from ref left join (select work_refid, last_harvest_dts from integration.work_site_mapping m join integration.site s on (m.site_id = s.id and s.logical_name = %(name)s) ) as mapped on (mapped.work_refid = ref.wid) where mapped.last_harvest_dts IS NULL order by {order_by} limit %(nb)s """ # select only ones with work_info harvested sql_lt = \ """ select wi.work_refid from integration.work_info wi inner join integration.work w on (wi.work_refid = w.refid) where w.last_harvest_dts IS NULL order by {order_by} limit %(nb)s """ if orderby_pop: order_by = 'popularity' else: order_by = '1' if site_logical_name == 'librarything': sql = sql_temp_harvested_in_babelio_not_lt.format(order_by=order_by) return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work}, as_dict=True) else: sql = sql_temp_harvested_in_babelio_not_gr.format(order_by=order_by) return elt.get_ro_connection().fetch_all(sql, {'nb': nb_work, 'name': site_logical_name, 'lang': lang_code}, as_dict=True)