def calculate_hash(self, spec):
    cache_hash = None
    if spec.pipeline_id in self.all_pipeline_ids:
        message = 'Duplicate key {0} in {1}' \
            .format(spec.pipeline_id, spec.abspath)
        spec.errors.append(SpecError('Duplicate Pipeline Id', message))
    else:
        cache_hash = resolve_dependencies(spec, self.all_pipeline_ids)

        if len(spec.errors) > 0:
            return cache_hash

        # Chain the hash through the steps: each step's cache hash covers the
        # previous hash, the executor's file contents and the step definition.
        for step in spec.pipeline_details['pipeline']:
            m = hashlib.md5()
            m.update(cache_hash.encode('ascii'))
            with open(step['executor'], 'rb') as f:
                m.update(f.read())
            m.update(json.dumps(step, ensure_ascii=True,
                                sort_keys=True).encode('ascii'))
            cache_hash = m.hexdigest()
            step['_cache_hash'] = cache_hash

        self.all_pipeline_ids[spec.pipeline_id] = spec
    spec.cache_hash = cache_hash
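# For illustration: a self-contained sketch of the chaining scheme used above
# (chain_step_hash is an assumed helper name, not part of the original). Each
# step's hash mixes the previous hash, the executor's bytes and the step's
# canonical JSON, so a change anywhere invalidates every later step's cache.
import hashlib
import json

def chain_step_hash(prev_hash, executor_bytes, step):
    m = hashlib.md5()
    m.update(prev_hash.encode('ascii'))
    m.update(executor_bytes)
    m.update(json.dumps(step, ensure_ascii=True, sort_keys=True).encode('ascii'))
    return m.hexdigest()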
def get_office_data(row, page, documents):
    input_fields_text_map = {
        "publication_id": "SERIAL_NUMBER",
        "publishnum": "PublishNum",
        "description": "PublicationName",
        "publisher": "Publisher",
        "claim_date": "ClaimDate",
        "last_update_date": "UpdateDate",
        "subjects": "PublicationSUBJECT",
        "publish_date": "PublishDate",
        "status": "PublicationSTATUS",
    }
    # Pull each labelled field off the page by its ASP.NET control id.
    source_data = {
        k: page("#ctl00_PlaceHolderMain_lbl_{}".format(v)).text()
        for k, v in input_fields_text_map.items()
    }
    publication_id = publication_id_from_url(row["url"])
    # A mismatch between the id in the URL and the id on the page means we
    # were served a wrong (or blocked) page.
    if str(publication_id) != str(source_data["publication_id"]):
        raise Exception("invalid or blocked response")
    return {
        "publisher_id": int(row["id"]),
        "publication_id": publication_id,
        # Tenders published by the Ministry of Finance's Government
        # Procurement Administration are "central"; all others are "office".
        "tender_type": ("office"
                        if source_data['publisher'] != 'משרד האוצר - מינהל הרכש הממשלתי'
                        else 'central'),
        "page_url": row["url"],
        "description": source_data["description"],
        # "supplier_id": None,
        # "supplier": None,
        # "contact": None,
        "publisher": source_data["publisher"],
        # "contact_email": None,
        "claim_date": parse_datetime(source_data["claim_date"]),
        "last_update_date": parse_date(source_data["last_update_date"]),
        # "reason": None,
        # "source_currency": None,
        # "regulation": None,
        # "volume": None,
        "subjects": source_data["subjects"],
        "start_date": parse_date(source_data["publish_date"]),
        # "end_date": None,
        "decision": source_data["status"],
        # "page_title": None,
        "tender_id": source_data["publishnum"] or 'none',
        "documents": json.dumps(documents, sort_keys=True, ensure_ascii=False),
    }
def __setitem__(self, key, value):
    conn = sqlite3.connect(self.filename)
    value = json.dumps(value)
    cursor = conn.cursor()
    # Upsert: delete any existing row for the key, then insert the new value.
    cursor.execute('DELETE FROM d WHERE _key=?', (key, ))
    cursor.execute('INSERT INTO d VALUES (?,?)', (key, value))
    conn.commit()
    conn.close()
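# A minimal sketch of the matching __getitem__ (assumed; the original is not
# shown here). It mirrors __setitem__'s single-table layout, where the value
# is assumed to be the second column, and raises KeyError on a miss like a
# regular mapping.
def __getitem__(self, key):
    conn = sqlite3.connect(self.filename)
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM d WHERE _key=?', (key, ))
    row = cursor.fetchone()
    conn.close()
    if row is None:
        raise KeyError(key)
    return json.loads(row[1])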
def set(self, key, value):
    value = json.dumps(value)
    try:
        # If the key already exists, update it in place...
        self.get(key)
        self.cursor.execute('''UPDATE d SET value=? WHERE key=?''',
                            (value, key))
    except KeyError:
        # ...otherwise insert a fresh row.
        self.cursor.execute('''INSERT INTO d VALUES (?, ?)''', (key, value))
    self.db.commit()
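# Sketch of the `get` that `set` above relies on (an assumed implementation,
# not shown in the original): it must raise KeyError on a missing key, which
# is what routes `set` to INSERT instead of UPDATE.
def get(self, key):
    self.cursor.execute('''SELECT value FROM d WHERE key=?''', (key, ))
    row = self.cursor.fetchone()
    if row is None:
        raise KeyError(key)
    return json.loads(row[0])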
def get_exemptions_data(self, row, page, documents):
    input_fields_text_map = {
        "publication_id": "SERIAL_NUMBER",
        "description": "PublicationName",
        "supplier_id": "SupplierNum",
        "supplier": "SupplierName",
        "contact": "ContactPersonName",
        "publisher": "PUBLISHER",
        "contact_email": "ContactPersonEmail",
        "claim_date": "ClaimDate",
        "last_update_date": "UpdateDate",
        "reason": "PtorReason",
        "source_currency": "Currency",
        "regulation": "Regulation",
        "volume": "TotalAmount",
        "subjects": "PublicationSUBJECT",
        "start_date": "StartDate",
        "end_date": "EndDate",
        "decision": "Decision",
        "page_title": "PublicationType",
    }
    source_data = {
        k: page("#ctl00_PlaceHolderMain_lbl_{}".format(v)).text()
        for k, v in input_fields_text_map.items()
    }
    publication_id = publication_id_from_url(row["url"])
    if str(publication_id) != str(source_data["publication_id"]):
        raise Exception("invalid or blocked response")
    return {
        "publisher_id": int(row["pid"]),
        "publication_id": publication_id,
        "tender_type": "exemptions",
        "page_url": row["url"],
        "description": source_data["description"],
        "supplier_id": source_data["supplier_id"],
        "supplier": source_data["supplier"],
        "contact": source_data["contact"],
        "publisher": source_data["publisher"],
        "contact_email": source_data["contact_email"],
        "claim_date": parse_datetime(source_data["claim_date"]),
        "last_update_date": parse_date(source_data["last_update_date"]),
        "reason": source_data["reason"],
        "source_currency": source_data["source_currency"],
        "regulation": source_data["regulation"],
        "volume": source_data["volume"],
        "subjects": source_data["subjects"],
        "start_date": parse_date(source_data["start_date"]),
        "end_date": parse_date(source_data["end_date"]),
        "decision": source_data["decision"],
        "page_title": source_data["page_title"],
        "tender_id": "none",
        "documents": json.dumps(documents, sort_keys=True, ensure_ascii=False),
    }
def _tostr(value):
    if isinstance(value, str):
        return value
    elif value is None:
        return ''
    elif isinstance(value, (int, float, bool, Decimal)):
        return str(value)
    elif isinstance(value, date):
        return value.isoformat()
    elif isinstance(value, (list, dict)):
        return json.dumps(value)
    assert False, \
        "Internal error - don't know how to handle %r of type %r" % (value, type(value))
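# Illustrative round-trips for _tostr, one per branch above:
#   _tostr('x')              -> 'x'
#   _tostr(None)             -> ''
#   _tostr(Decimal('1.5'))   -> '1.5'
#   _tostr(date(2020, 1, 2)) -> '2020-01-02'
#   _tostr({'a': [1, 2]})    -> '{"a": [1, 2]}'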
async def events(request: web.Request):
    loop = request.app.loop
    uuid = request.match_info['id']
    async with sse_response(request, headers=CORS_HEADERS) as resp:
        try:
            async with ProcessRunner(loop, uuid) as process:
                print('starting!', uuid)
                # Stream the process's stderr to the client, line by line.
                async for line in LineReader(process.stderr):
                    if line is None:
                        continue
                    resp.send(line)
                print('done!', uuid)
                resp.send('close')
        except Exception as e:
            msg = 'General Error %s' % e
            resp.send(json.dumps({'e': 'err', 'msg': msg, 'uuid': 'general'}))
    return resp
def get_central_data(self, row, page, documents):
    # michraz_number = page("#ctl00_PlaceHolderMain_MichraznumberPanel div.value").text().strip()
    documents = []
    for elt in page("#ctl00_PlaceHolderMain_SummaryLinksPanel_SummaryLinkFieldControl1__ControlWrapper_SummaryLinkFieldControl a"):
        documents.append({"description": ' '.join(elt.text.strip().split()),
                          "link": elt.attrib["href"],
                          "update_time": None})
    for elt in page("#ctl00_PlaceHolderMain_SummaryLinks2Panel"):
        documents.append({"description": ' '.join(pq(elt).text().strip().split()),
                          "link": pq(elt).find("a")[0].attrib["href"],
                          "update_time": None})
    publication_id = page("#ctl00_PlaceHolderMain_ManofSerialNumberPanel div.value").text().strip()
    outrow = {
        "publisher_id": None,
        "publication_id": int(publication_id) if publication_id else 0,
        "tender_type": "central",
        "page_url": row["url"],
        "description": page("#ctl00_PlaceHolderMain_GovXContentSectionPanel_Richhtmlfield1__ControlWrapper_RichHtmlField").text().strip(),
        "supplier_id": None,
        "supplier": page("#ctl00_PlaceHolderMain_GovXParagraph1Panel_ctl00__ControlWrapper_RichHtmlField div").text().strip(),
        "contact": page("#ctl00_PlaceHolderMain_WorkerPanel_WorkerPanel1 div.worker").text().strip(),
        "publisher": None,
        "contact_email": None,
        "claim_date": None,
        "last_update_date": None,
        "reason": None,
        "source_currency": None,
        "regulation": page("#ctl00_PlaceHolderMain_MIchrazTypePanel div.value").text().strip(),
        "volume": None,
        "subjects": page("#ctl00_PlaceHolderMain_MMDCategoryPanel div.value").text().strip(),
        "start_date": None,
        "end_date": parse_date(page("#ctl00_PlaceHolderMain_TokefEndDatePanel div.Datevalue").text().strip()),
        "decision": page("#ctl00_PlaceHolderMain_MichrazStatusPanel div.value").text().strip(),
        "page_title": page("h1.MainTitle").text().strip(),
        "tender_id": tender_id_from_url(row["url"]),
        "documents": json.dumps(documents, sort_keys=True, ensure_ascii=False),
    }
    # An entirely empty page means we were served an invalid or blocked response.
    if outrow["description"] == "" and outrow["supplier"] == "" and outrow["subjects"] == "":
        raise Exception("invalid or blocked response")
    return outrow
import logging

# LazyJsonLine is assumed to live in the same extended_json module as json.
from datapackage_pipelines.utilities.extended_json import json, LazyJsonLine
from datapackage_pipelines.wrapper import spew, ingest

parameters, datapackage, res_iter = ingest()
res_name = parameters.get('resource', datapackage['resources'][0]['name'])


def show_sample(res):
    logging.info('SAMPLE OF LINES from %s', res.spec['name'])
    for i, row in enumerate(res):
        # Log the first 10 rows but pass every row through unchanged.
        if i < 10:
            if isinstance(row, LazyJsonLine):
                logging.info('#%s: %s', i, row._evaluate())
            else:
                logging.info('#%s: %r', i, row)
        yield row


def process_resources(res_iter_):
    for res in res_iter_:
        logging.info('? from %s', res.spec['name'])
        if res.spec['name'] == res_name:
            yield show_sample(res)
        else:
            yield res


logging.info(json.dumps(datapackage, indent=2))
spew(datapackage, process_resources(res_iter))
def set_status(self, pipeline_id, status):
    if self.is_init():
        self.redis.set(pipeline_id, json.dumps(status, ensure_ascii=True))
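# A possible read-side counterpart (a sketch, not from the original), assuming
# statuses were stored as JSON strings by set_status above.
def get_status(self, pipeline_id):
    if self.is_init():
        raw = self.redis.get(pipeline_id)
        if raw is not None:
            return json.loads(raw)
    return None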
def jsonize(obj):
    return json.dumps(obj)
def calc_equivs(cur_year, rows, connected_items, new_connected_items, to_delete):
    # rows = list(rows)
    # logging.info('cur_year: %r, num rows = %d, prev_year=%d', cur_year, len(rows), len(list(connected_items.iterator())))
    # logging.info('connected_items: %r', connected_items)
    # logging.info('new_connected_items: %r', new_connected_items)
    mapped_levels = {}
    unmatched = []
    for row in rows:
        row = normalize(row)
        equivs = []
        parent = row['parent']
        children = row['children']
        ids = [{'code': row['code'], 'title': row['title']}]
        while len(ids) > 0:
            logging.debug('%d/%r: ids: %r', cur_year, row['code'], ids)
            id = ids.pop(0)

            test_value = sum(
                abs(row[f])
                for f in ('net_allocated', 'gross_allocated', 'net_revised',
                          'commitment_allocated', 'net_used')
                if row.get(f) is not None)
            non_repeating = row.get('non_repeating', [])
            non_repeating = '1' in non_repeating and len(non_repeating) == 1
            if (test_value == 0 and not row['code'].endswith('99')) or non_repeating:
                unmatched.append(row)
                row = None
                break

            # Find curated record for id
            curated_items = curated.get((cur_year, id['code']))
            if curated_items is not None:
                if len(curated_items) == 0:
                    unmatched.append(row)
                    row = None
                    break
                for year, code in curated_items:
                    assert year == cur_year - 1
                    value = get(connected_items, code)
                    if value is not None:
                        equivs.append(value)
                    else:
                        logging.warning('%d/%s: Failed to find curated item %s/%s',
                                        cur_year, id['code'], year, code)
                if len(equivs) > 0:
                    logging.debug('FOUND CURATED ITEM for %r', id)
                    continue
                else:
                    logging.warning('FOUND 0 CURATED ITEMS for %r', id)

            # Find connected item with same code and title
            connected_item = get(connected_items, id['code'])
            if connected_item is not None:
                if similar(id['title'], connected_item['title']):
                    logging.debug('FOUND EXACT ITEM for %r', id)
                    equivs.append(connected_item)
                    continue

            # Try to find similar named items which moved to a new parent
            if parent is not None:
                connected_item = get(new_connected_items, parent)
                if connected_item is not None:
                    parent = None
                    assert connected_item['year'] == cur_year
                    prev_year_rows = connected_item['history'].get(cur_year - 1, [])
                    candidates = []
                    for prev_year_row in prev_year_rows:
                        prev_year_children = prev_year_row['children']
                        if prev_year_children is None:
                            continue
                        for prev_year_child in prev_year_children:
                            if similar(prev_year_child['title'], id['title']):
                                candidates.append(prev_year_row)
                    if len(candidates) == 1:
                        connected_item = get(connected_items, candidates[0]['code'])
                        if connected_item is not None:
                            logging.debug('FOUND MOVED ITEM for %r', id)
                            equivs.append(connected_item)
                            continue

            # Split into children
            if children is not None and len(children) > 0:
                logging.debug('SPLITTING TO CHILDREN for %r', id)
                ids.extend({'code': x['code'], 'title': x['title']}
                           for x in children)
                children = None
                continue

            # Couldn't find match - no point in continuing
            logging.debug('FAILED TO FIND MATCH for %s/%s', cur_year, id)
            unmatched.append(row)
            row = None
            break

        # Found match
        if row is not None:
            assert len(equivs) > 0
            new_history = {}
            # logging.info(', '.join(x['code'] for x in equivs))
            codes = set()
            for equiv in equivs:
                if equiv['code'] in codes:
                    continue
                codes.add(equiv['code'])
                s = mapped_levels.setdefault(equiv['code'], set())
                if len(row['code']) in s:
                    logging.warning('DOUBLE BOOKING for %s/%s from %s/%s',
                                    equiv['year'], equiv['code'],
                                    row['year'], row['code'])
                    for nci in iterate_values(new_connected_items):
                        for hist_item in nci.get('history', {}).get(equiv['year'], []):
                            if hist_item['code'] == equiv['code']:
                                logging.warning('FOUND')
                                logging.warning('%s', json.dumps(nci, indent=2))
                else:
                    s.add(len(row['code']))
                to_delete.add(equiv['code'])
                for year, hist_item in equiv['history'].items():
                    update_equiv(new_history.setdefault(year, {}), hist_item)
                update_equiv(new_history.setdefault(equiv['year'], {}), equiv)
            row['history'] = new_history
            put(new_connected_items, row['code'], row)
    logging.error('UNMATCHED %d: %d', cur_year, len(unmatched))
    return unmatched
def put(db, key, value):
    assert value is not None
    enc = json.dumps(value)
    # json.dumps defaults to ensure_ascii=True, so the ASCII encode is safe.
    db.put(key.encode('utf8'), enc.encode('ascii'))
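# Sketch of the matching reader (assumed, mirroring put above). The db object
# is assumed to expose a LevelDB/plyvel-style get() that returns the stored
# bytes, or None when the key is absent.
def get(db, key):
    enc = db.get(key.encode('utf8'))
    if enc is None:
        return None
    return json.loads(enc.decode('ascii'))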
def get_central_data(row, page, documents):
    # michraz_number = page("#ctl00_PlaceHolderMain_MichraznumberPanel div.value").text().strip()
    outrow = copy.deepcopy(row)
    publication_id = page("#ctl00_PlaceHolderMain_ManofSerialNumberPanel div.value").text().strip()
    if publication_id:
        # The central tender links to an office tender page - scrape that too.
        ot_url = BASE_URL + '/officestenders/Pages/officetender.aspx?pID={}'.format(publication_id)
        ot_page = pq(_get_url_response_text(ot_url))
        documents = extract_documents(ot_page)
        outrow['url'] = ot_url
        outrow['id'] = -1
        outrow = get_office_data(outrow, ot_page, documents)
    else:
        logging.info('no publication id, continuing')
    dd = []
    for elt in page("#ctl00_PlaceHolderMain_SummaryLinksPanel_SummaryLinkFieldControl1__ControlWrapper_SummaryLinkFieldControl a"):
        link = elt.attrib["href"]
        if 'officestenders/Pages/officetender.aspx?pID=' not in link:
            dd.append((' '.join(elt.text.strip().split()), link))
    for elt in page("#ctl00_PlaceHolderMain_SummaryLinks2Panel"):
        link = pq(elt).find("a")[0].attrib["href"]
        if 'officestenders/Pages/officetender.aspx?pID=' not in link:
            dd.append((' '.join(pq(elt).text().strip().split()), link))
    documents.extend(dict(description=d[0], link=d[1], update_time=None)
                     for d in dd)
    description = ' '.join([
        page("#ctl00_PlaceHolderMain_GovXContentSectionPanel_Richhtmlfield1__ControlWrapper_RichHtmlField").text().strip(),
        page("#ctl00_PlaceHolderMain_GovXParagraph1Panel_ctl00__ControlWrapper_RichHtmlField div").text().strip(),
    ]).strip()
    outrow.update({
        "publication_id": int(publication_id) if publication_id else 0,
        "tender_type": "central",
        "page_url": row["url"],
        "description": description,
        "contact": page("#ctl00_PlaceHolderMain_WorkerPanel_WorkerPanel1 div.worker").text().strip(),
        "regulation": page("#ctl00_PlaceHolderMain_MIchrazTypePanel div.value").text().strip(),
        "subjects": page("#ctl00_PlaceHolderMain_MMDCategoryPanel div.value").text().strip(),
        "end_date": (parse_date(page("#ctl00_PlaceHolderMain_TokefEndDatePanel div.Datevalue").text().strip())
                     or parse_date(page("#ctl00_PlaceHolderMain_HoraatShaaEndDatePanel div.Datevalue").text().strip())),
        "start_date": (outrow.get('start_date')
                       or parse_date(page("#ctl00_PlaceHolderMain_HoraatShaaStartDatePanel div.Datevalue").text().strip())),
        "decision": page("#ctl00_PlaceHolderMain_MichrazStatusPanel div.value").text().strip(),
        "page_title": page("h1.MainTitle").text().strip(),
        "tender_id": tender_id_from_url(row["url"]),
        "documents": json.dumps(documents, sort_keys=True, ensure_ascii=False),
    })
    if not any(outrow.get(x) for x in ("description", "supplier", "subjects")):
        raise Exception("invalid or blocked response")
    return outrow
def _send(self, msg):
    msg['uuid'] = self.uuid
    # `only_last` is presumably a module-level flag selecting whether only the
    # 'last' stream, or everything but the 'last' stream, gets logged.
    if only_last and self.uuid == 'last':
        logging.info(json.dumps(msg))
    elif not only_last and self.uuid != 'last':
        logging.info(json.dumps(msg))