def dump_orgs():
    orgs = load_utf_json(RAW_JSON_FNAME)
    for org in orgs:
        # Rubric fields arrive as raw strings; split them into lists.
        for rubric in [TARGETS, ACTIVITIES, PROJECTS, SERVICES]:
            item = org[rubric]
            if item:
                org[rubric] = str_to_list(item)
        raw_codes = org[CODES]
        if raw_codes:
            codes = str_to_list(raw_codes)
            org[CODES] = codes
            # Pull the individual registration codes out of the list.
            org[OGRN] = find_code(codes, r'ОГРН')
            org[INN] = find_code(codes, r'ИНН')
            org[KPP] = find_code(codes, r'КПП')
            org[OKPO] = find_code(codes, r'ОКПО')
            org[OKATO] = find_code(codes, r'ОКАТО')
        else:
            org[OGRN], org[INN], org[KPP], org[OKPO], org[OKATO] = None, None, None, None, None
        # Manual fixes for two entries whose codes are missing or wrong at the source.
        if org[SOURCE] == 'http://nko71.ru/katalog-nko/nko-po-uslugam/sotsialnaya-pomoshch-i-podderzhka/nasledie.html':
            org[OGRN] = '1097100001129'
        if org[SOURCE] == 'http://nko71.ru/katalog-nko/nko-po-gruppam-naseleniya/zhenshchiny-semi-s-detmi/soyuz-' + \
                'pravoslavnykh-zhenshchin.html':
            org[INN] = '7116511663'
            org[KPP] = '711601001'
    dump_utf_json(orgs, JSON_FNAME)


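# dump_orgs relies on two helpers, str_to_list and find_code, that are
# defined elsewhere in the project. A minimal sketch of what they might
# look like, assuming rubric/code fields are semicolon-separated strings
# and each code entry pairs a label with digits (e.g. 'ОГРН 1097100001129'):

import re


def str_to_list(raw):
    # Assumption: items within a field are separated by semicolons.
    return [item.strip() for item in raw.split(';') if item.strip()]


def find_code(codes, label):
    # Assumption: return the digits following the matching label, or None.
    for code in codes:
        match = re.search(label + r'\D*(\d+)', code)
        if match:
            return match.group(1)
    return None

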
def dump_orgs():
    orgs = list()
    for year in range(2012, 2018):
        print("Parsing %d..." % year)
        orgs.extend(parse_xls(year))
    dump_utf_json(orgs, JSON_FNAME)


def dump_raw_orgs():
    orgs = list()
    urls = load_utf_json(URL_JSON_FNAME)
    for ind, url in enumerate(urls):
        org = scrape_org(url)
        print(ind)
        print(org[ORGNAME])
        print()
        orgs.append(org)
    dump_utf_json(orgs, RAW_JSON_FNAME)


def add_fields(source_json, target_json=None, **fields):
    # Add (or overwrite) the given fields on every entry; write back to
    # source_json unless a separate target_json is given.
    if not target_json:
        target_json = source_json
    entries = load_utf_json(source_json)
    total = len(entries)
    count = 0
    for entry in entries:
        count += 1
        print("\r{} / {}".format(count, total), end='', flush=True)
        for fieldname, val in fields.items():
            entry[fieldname] = val
    print("\nDumping...")
    dump_utf_json(entries, target_json)


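# A hypothetical call; the file names and the field are illustrative only:
#
#     add_fields('orgs.json', 'orgs_tagged.json', region='Тульская область')

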
def make_json():
    orgs2016 = scrape(url=URL2016, beg=5, diff=3, field=COMMENT, field_n_a=VIOLATIONS)
    orgs2017 = scrape(url=URL2017, beg=6, diff=2, field=VIOLATIONS, field_n_a=COMMENT)
    orgs_2016_2017 = orgs2016 + orgs2017
    dump_utf_json(orgs_2016_2017, JSON_FNAME)


def duplicate_entry(source_json, target_json=None, **fields):
    # Find the first entry matching all given field values and insert a
    # duplicate of it immediately after; write back to source_json unless
    # a separate target_json is given.
    if not target_json:
        target_json = source_json
    entries = load_utf_json(source_json)
    total = len(entries)
    for indx in range(len(entries)):
        count = indx + 1
        entry = entries[indx]
        print("\r{} / {}".format(count, total), end='', flush=True)
        for fieldname, val in fields.items():
            if entry[fieldname] != val:
                break
        else:
            # for-else: this branch runs only if the loop above finished
            # without a break, i.e. every requested field matched.
            entries.insert(count, entry)
            print("\nFound at {}, inserted at {}!".format(indx, count))
            print("Dumping...")
            dump_utf_json(entries, target_json)
            return
    print("\nNo such entry!")


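# A hypothetical call, matching on the ORGNAME constant used above
# (the value shown is illustrative only):
#
#     duplicate_entry(JSON_FNAME, **{ORGNAME: 'Наследие'})

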
def dump_urls():
    dump_utf_json(scrape_urls(), URL_JSON_FNAME)
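

# Every function above goes through load_utf_json / dump_utf_json, project
# helpers not shown here. A plausible sketch, assuming they simply wrap the
# standard json module with UTF-8 handling and ensure_ascii=False so the
# Cyrillic text stays readable in the dumps:

import json


def load_utf_json(fname):
    with open(fname, encoding='utf-8') as f:
        return json.load(f)


def dump_utf_json(data, fname):
    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)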