def read_page(data_part, source):
    import gqueue
    import requests

    log_line("read_page: " + source)
    headers = data_part["headers"]
    with requests.Session() as ss:
        ss.headers.update(headers)
        with ss.get(source) as resp:
            content_type = resp.headers.get('content-type')
            chset = 'utf-8'  # default when no charset is declared
            if content_type:
                match_chset = re.search(r'(charset\W+)([^\;]+)',
                                        content_type,
                                        flags=re.IGNORECASE)
                if match_chset and len(match_chset.groups()) > 1:
                    chset = match_chset[2]
                else:
                    log_line("read_page: content_type is {}".format(content_type))
            # chset = resp.encoding
            page = resp.content.decode(chset)
    # Hand the decoded page on to the next stage of the pipeline.
    result = dict(data_part)
    result["page"] = page
    log_line("read_page: page length is {}, charset is {}".format(
        len(page), chset))
    gqueue.add_task(body=gqueue.save_body(json.dumps(result), Q_KEY_TYPE),
                    uri=flask.url_for('web_mon.run_data_part'))
    return str(len(result))
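# A minimal, standalone sketch of the charset detection used in read_page:
# given a Content-Type header value, pick out the declared charset or fall
# back to utf-8. The helper name and the sample header values below are
# illustrative only, not part of the module.
def _demo_charset_detection():
    for content_type in ('text/html; charset=windows-1251',
                         'application/json; charset=UTF-8',
                         'text/plain'):
        chset = 'utf-8'
        match_chset = re.search(r'(charset\W+)([^\;]+)', content_type,
                                flags=re.IGNORECASE)
        if match_chset and len(match_chset.groups()) > 1:
            chset = match_chset[2]
        print('{} -> {}'.format(content_type, chset))  # 'text/plain -> utf-8'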
def task_email_forward():
    res = gqueue.add_task(
        body='{"mailfolder": "2Vera", "receiver": "*****@*****.**"}',
        uri=flask.url_for('gqueue.forward_next_email'))
    return res.name
def run_mon():
    import gqueue
    mon_setup = read_setup()
    # Every setup except the last one: enqueue each data part as-is.
    for fs in mon_setup[:-1]:
        for da in fs["data"]:
            data_part = dict(da)
            data_part["headers"] = fs["headers"]
            gqueue.add_task(body=gqueue.save_body(json.dumps(data_part), Q_KEY_TYPE),
                            uri=flask.url_for('web_mon.run_data_part'))
    # The last setup: its final data part is marked with "last" so the
    # downstream handler knows when to schedule send_results.
    for ls in mon_setup[-1:]:
        for da in ls["data"][:-1]:
            data_part = dict(da)
            data_part["headers"] = ls["headers"]
            gqueue.add_task(body=gqueue.save_body(json.dumps(data_part), Q_KEY_TYPE),
                            uri=flask.url_for('web_mon.run_data_part'))
        for da in ls["data"][-1:]:
            data_part = dict(da)
            data_part["headers"] = ls["headers"]
            data_part["last"] = True
            gqueue.add_task(body=gqueue.save_body(json.dumps(data_part), Q_KEY_TYPE),
                            uri=flask.url_for('web_mon.run_data_part'))
    return "Ok - " + str(len(mon_setup))
def send_results():
    import gqueue
    # If other monitoring tasks are still in flight, postpone sending.
    if gqueue.is_tasks_exist(except_of=flask.url_for('web_mon.send_results')):
        gqueue.add_task(body=json.dumps({"go": True}),
                        uri=flask.url_for('web_mon.send_results'),
                        in_seconds=60)
        return "Rerun later"
    from google.cloud import datastore
    dcli = datastore.Client(project=gqueue.PROJECT_ID)
    query = dcli.query(kind=KIND_WM_RESULT, order=["send_to"])
    send_to = '##'
    body = None
    msg = {}
    keys = []
    log_line("send_results: query {}".format(str(query)))
    sender_count = 0
    # Results are ordered by recipient, so consecutive entities with the
    # same "send_to" can be merged into a single e-mail task.
    for ent in query.fetch():
        recipient = ent.get("send_to").lower()
        if recipient != send_to:
            # The recipient changed: flush the accumulated message first.
            if body and msg:
                msg["body"] = body
                msg["keys"] = keys
                gqueue.add_task(body=gqueue.save_body(json.dumps(msg), Q_SEND_MAIL),
                                uri=flask.url_for('web_mon.send_email_result'),
                                in_seconds=120 + sender_count * 5)
                sender_count += 1
            body = ""
            msg = {"send_to": recipient}
            keys.clear()
            send_to = recipient
        body += ent.get("body")
        keys.append({"kind": ent.key.kind, "name": ent.key.name})
    # Flush the message for the last recipient.
    if body and msg:
        msg["body"] = body
        msg["keys"] = keys
        gqueue.add_task(body=gqueue.save_body(json.dumps(msg), Q_SEND_MAIL),
                        uri=flask.url_for('web_mon.send_email_result'),
                        in_seconds=120 + sender_count * 5)
    query = None
    dcli = None
    return "Ok"
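# send_results relies on the Datastore query being ordered by "send_to":
# rows for the same recipient arrive consecutively, are concatenated, and
# are flushed as one message whenever the recipient changes. A minimal
# sketch of that group-and-flush pattern on plain dicts; the helper name
# and sample rows are hypothetical.
def _demo_group_by_recipient():
    rows = [{"send_to": "a@x", "body": "1"},
            {"send_to": "a@x", "body": "2"},
            {"send_to": "b@y", "body": "3"}]
    send_to, body, flushed = '##', None, []
    for ent in rows:
        recipient = ent["send_to"].lower()
        if recipient != send_to:
            if body:
                flushed.append({"send_to": send_to, "body": body})
            body = ""
            send_to = recipient
        body += ent["body"]
    if body:
        flushed.append({"send_to": send_to, "body": body})
    # [{'send_to': 'a@x', 'body': '12'}, {'send_to': 'b@y', 'body': '3'}]
    return flushed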
def handling_sources_bunch():
    parts_json = get_body()
    parts = json.loads(parts_json)
    log_line("handling_sources_bunch: sources - {}".format(len(parts)))
    import gqueue
    parts_count = len(parts)  # Total number of records
    if parts_count > MAX_SOURCES_BUNCH:
        from math import ceil
        bunches_count = ceil(
            parts_count / MAX_SOURCES_BUNCH)  # Number of bunches to send
        send_count = parts_count  # Number of records sent for processing
        if bunches_count > MAX_SOURCES_BUNCH:
            bunches_count = MAX_SOURCES_BUNCH - 1  # Number of bunches with records to process
            send_count = bunches_count * MAX_SOURCES_BUNCH
        min_cnt = MAX_SOURCES_BUNCH - (
            send_count % MAX_SOURCES_BUNCH
        )  # Number of records missing to fill the last bunch
        min_cnt = min_cnt if min_cnt < MAX_SOURCES_BUNCH else 0
        max_recs = int(
            min_cnt / bunches_count
        )  # By how many records each full-size bunch must be reduced
        min_cnt = min_cnt % bunches_count  # How many bunches must be reduced by one extra record
        part_lens = [
            MAX_SOURCES_BUNCH - max_recs - 1 if ii < min_cnt else MAX_SOURCES_BUNCH - max_recs
            for ii in range(bunches_count)
        ]
        cur_part = 0
        for cnt in part_lens:
            links_list = parts[cur_part:cur_part + cnt]
            cur_part += cnt
            gqueue.add_task(
                body=gqueue.save_body(json.dumps(links_list), Q_KEY_TYPE),
                uri=flask.url_for('web_mon.handling_sources_bunch'))
        # Whatever remains goes out as one more bunch to be split again.
        links_list = parts[cur_part:]
        if links_list:
            gqueue.add_task(
                body=gqueue.save_body(json.dumps(links_list), Q_KEY_TYPE),
                uri=flask.url_for('web_mon.handling_sources_bunch'))
    else:
        # Small enough to process directly: spread the tasks out in time.
        time_shift = 0
        for src in parts:
            gqueue.add_task(body=gqueue.save_body(json.dumps(src), Q_KEY_TYPE),
                            uri=flask.url_for('web_mon.run_data_part'),
                            in_seconds=int(time_shift / 5) * 2)
            time_shift += 1
        log_line("handling_sources_bunch: time shift {}".format(time_shift))
    return "Ok"
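# Worked example of the splitting arithmetic above, assuming
# MAX_SOURCES_BUNCH = 10 and ignoring the cap branch where bunches_count
# itself exceeds the maximum. With 23 records: bunches_count = ceil(23/10)
# = 3; min_cnt = 10 - 23 % 10 = 7 (records short of a full last bunch);
# max_recs = 7 // 3 = 2 (each bunch shrinks by 2); 7 % 3 = 1 (one bunch
# shrinks by one more), so part_lens = [7, 8, 8], which sums to exactly 23
# and leaves no remainder task. The helper below is hypothetical,
# for illustration only.
def _demo_part_lens(parts_count=23, max_bunch=10):
    from math import ceil
    bunches_count = ceil(parts_count / max_bunch)
    min_cnt = max_bunch - (parts_count % max_bunch)
    min_cnt = min_cnt if min_cnt < max_bunch else 0
    max_recs = min_cnt // bunches_count
    min_cnt = min_cnt % bunches_count
    return [max_bunch - max_recs - 1 if ii < min_cnt else max_bunch - max_recs
            for ii in range(bunches_count)]  # [7, 8, 8] for the defaults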
def page_desc(data_part, page):
    src = list(data_part.get("sources"))
    src_type = src[0].get("type")
    body = None
    links = None
    errs = None
    log_line("page_desc: Type is {}".format(src_type))
    if src_type == 'jsp':
        body, links, errs = plain_json(page, src[0])
    elif src_type == 'xsearch_xp':
        import html as py_html
        # Map &nbsp; to a plain space before unescaping.
        py_html.entities.html5["nbsp"] = ' '
        py_html.entities.html5["nbsp;"] = ' '
        body, links, errs = xsearch(py_html.unescape(page), src[0])
    if body:
        save_mon_result(body, data_part.get("send_to"))
    if errs:
        for e in errs:
            log_line("page_desc error - {}".format(e))
        return "has " + str(len(errs)) + " errors"
    log_line("page_desc: body - {}, links - {}, err - {}".format(
        body if body is None else len(body),
        links if links is None else len(links),
        errs if errs is None else len(errs)))
    result = {}
    import gqueue
    has_last = data_part.get("last")
    if len(src) > 1:
        result["headers"] = data_part["headers"].copy()
        result["send_to"] = data_part["send_to"]
        if links:
            first_source = src[1].get("source")
            has_source = first_source and not first_source.startswith('#')
            if has_last:
                # Split the final link off so only its task carries "last".
                first_lnk = -1
                last_lnk = -1
            else:
                first_lnk = None
                last_lnk = len(links)
            links_list = []
            import copy
            for lnk in links[:first_lnk]:
                if not has_source:
                    src[1]["source"] = str(lnk)
                result["sources"] = list(src[1:])
                links_list.append(copy.deepcopy(result))
            for lnk in links[last_lnk:None]:
                if not has_source:
                    src[1]["source"] = str(lnk)
                result["sources"] = list(src[1:])
                result["last"] = True
                links_list.append(copy.deepcopy(result))
            if len(links_list) == 1:
                gqueue.add_task(body=gqueue.save_body(
                    json.dumps(links_list[0]), Q_KEY_TYPE),
                                uri=flask.url_for('web_mon.run_data_part'))
            else:
                gqueue.add_task(
                    body=gqueue.save_body(json.dumps(links_list), Q_KEY_TYPE),
                    uri=flask.url_for('web_mon.handling_sources_bunch'))
        else:
            result["sources"] = src[1:]
            if has_last:
                result["last"] = True
            gqueue.add_task(body=gqueue.save_body(json.dumps(result), Q_KEY_TYPE),
                            uri=flask.url_for('web_mon.run_data_part'))
    elif has_last:
        # No sources left and this was the last part: trigger send_results.
        result["go"] = True
        gqueue.add_task(body=json.dumps(result),
                        uri=flask.url_for('web_mon.send_results'),
                        in_seconds=60)
    return str(len(result))
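# The first_lnk/last_lnk indices in page_desc implement a small slicing
# trick: when the incoming part is marked "last", the final link is split
# off so that only its follow-up task carries last=True; otherwise every
# link falls into the plain slice and the "last" slice is empty. A toy
# illustration (the helper is hypothetical):
def _demo_last_split(links, has_last):
    if has_last:
        first_lnk, last_lnk = -1, -1
    else:
        first_lnk, last_lnk = None, len(links)
    return links[:first_lnk], links[last_lnk:None]
# _demo_last_split(['a', 'b', 'c'], True)  -> (['a', 'b'], ['c'])
# _demo_last_split(['a', 'b', 'c'], False) -> (['a', 'b', 'c'], [])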
def run_web_mon():
    res = gqueue.add_task(body=None, uri=flask.url_for('web_mon.run_mon'))
    return res.name
def bee_serv_check():
    res = gqueue.add_task(body=None,
                          uri=flask.url_for('gqueue.view_bee_service'))
    return res.name