def assets(file_path, no_commit=False):
    """Load asset statements parsed from `file_path` into `AssetStatement`.

    Every record yielded by `parse_assets(file_path)` is matched to a person
    through its normalized `person_name` and handed to a `TablePatcher`
    keyed on (`person_id`, `date`); the date is always 2012-11-01.  The
    patcher is run with ``remove=True``.

    :param file_path: passed unchanged to `parse_assets`.
    :param no_commit: when true the session is rolled back at the end
        instead of being committed.
    :raises KeyError: when a record's normalized name is missing from the
        name map, or a record lacks one of the fields read below.
    """
    from mptracker.scraper.assets import parse_assets
    from mptracker.nlp import normalize

    asset_patcher = TablePatcher(
        models.AssetStatement,
        models.db.session,
        key_columns=['person_id', 'date'],
    )

    # Normalized name -> person id, built from the Person query joined to
    # mandates and filtered with year=2012.
    people_query = (
        models.Person.query
        .join(models.Person.mandates)
        .filter_by(year=2012)
    )
    people_map = {
        normalize(person.name): person.id
        for person in people_query
    }

    statement_date = date(2012, 11, 1)

    with asset_patcher.process(remove=True) as add_asset:
        for record in parse_assets(file_path):
            person_name = normalize(record.pop('person_name'))
            person_id = people_map[person_name]

            # These two fields are dropped so they do not end up in
            # `raw_data`; `person_name` was already popped above.
            del record['constituency']
            del record['county']

            net_worth_eur = (
                record['acct_value']['TOTAL_EUR']
                - record['debt_value']['TOTAL_EUR']
                + record['invest_value']['TOTAL_EUR']
                + record['valuables_value']['TOTAL_EUR']
            )
            realty_count = (
                record['realty_apartment_count']
                + record['realty_business_count']
                + record['realty_house_count']
            )
            year_income_eur = (
                record['family_income_value']['TOTAL_EUR']
                + record['gift_value']['TOTAL_EUR']
                + record['sales_value']['TOTAL_EUR']
            )

            add_asset({
                'person_id': person_id,
                'date': statement_date,
                'raw_data': record,
                'net_worth_eur': net_worth_eur,
                'land_agri_area': record['land_agri_area'],
                'land_city_area': record['land_city_area'],
                'realty_count': realty_count,
                'vehicle_count': record['vehicle_count'],
                'year_income_eur': year_income_eur,
            })

    if no_commit:
        # `warn` is a deprecated alias; `warning` is the supported name.
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
def get_romania_curata():
    """Attach scraped "Romania Curata" text to matching `Person` rows.

    Reads ``[name, fortune]`` pairs from ``scraper_curata_out.json`` and a
    mapping of normalized scraped names to database names from
    ``romania_curata_exceptions.json``.  For each scraped entry:

    * if its normalized name is a key in the exceptions mapping, the mapped
      database name is used directly;
    * every ordering of the space-separated words of the normalized name is
      compared against every still-unmatched, normalized database name with
      `difflib.SequenceMatcher`; the best score (ratio * 100) is accepted
      when it is strictly above 93.

    A matched person gets ``romania_curata`` set to the fortune lines joined
    with newlines and its name is removed from the pool of candidates.  The
    session is committed once at the end.
    """
    from os import path
    from difflib import SequenceMatcher as sm
    from itertools import permutations
    import json
    from mptracker.nlp import normalize

    sql_names = [person.name for person in models.Person.query.all()]
    # Normalize each database name once instead of once per scraped entry.
    normalized_sql_names = {name: normalize(name) for name in sql_names}

    with open(path.relpath("mptracker/scraper/scraper_curata_out.json"),
              'r', encoding='utf-8') as f:
        scraper_result = json.load(f)

    with open(path.relpath(
            'mptracker/scraper/romania_curata_exceptions.json'),
            'r', encoding='utf-8') as f:
        person_exceptions = json.load(f)

    def matching_score(first_name, second_name):
        """Return the similarity of the two names as a 0-100 score."""
        return sm(None, first_name, second_name).ratio() * 100

    def add_person(name, fortune):
        """Store `fortune` on the person called `name`, if one exists."""
        person = (
            models.Person.query
            .filter_by(name=name)
            .first()
        )
        if person is None:
            return
        person.romania_curata = "\n".join(fortune)
        print("Found a match for ", name.encode('utf-8'))
        # A matched person is no longer a candidate for later entries.
        sql_names.remove(name)

    for name, fortune in scraper_result:
        name_scraper = normalize(name)

        if name_scraper in person_exceptions:
            add_person(person_exceptions[name_scraper], fortune)
        # NOTE(review): fuzzy matching below still runs after a manual
        # exception was applied, so the same fortune can also be stored on
        # a second person -- confirm whether that is intended.

        # The word orderings depend only on the scraped name, so build
        # them once rather than once per database name.
        candidates = [
            " ".join(perm)
            for perm in permutations(name_scraper.split(" "))
        ]

        best_score, best_name = 0, 0
        for temporary_sqlname in sql_names:
            name_sql = normalized_sql_names[temporary_sqlname]
            for candidate in candidates:
                current_matching = matching_score(candidate, name_sql)
                # Strict comparison: the first best-scoring name wins ties.
                if best_score < current_matching:
                    best_score, best_name = current_matching, temporary_sqlname

        if best_score > 93:
            add_person(best_name, fortune)

    models.db.session.commit()
def get_romania_curata():
    """Match scraped "Romania Curata" entries to people and store them.

    Input comes from two JSON files: ``scraper_curata_out.json`` holds
    ``[name, fortune]`` pairs and ``romania_curata_exceptions.json`` maps
    normalized scraped names to database names.

    Each scraped name is normalized.  If it appears in the exceptions
    mapping, the mapped database name is used.  Independently of that, all
    orderings of the name's space-separated words are scored against every
    remaining normalized database name using `difflib.SequenceMatcher`
    (ratio * 100), and the highest-scoring name is accepted if its score
    exceeds 93.  Accepted people get ``romania_curata`` set to the fortune
    lines joined by newlines, and are removed from the remaining names.
    The session is committed after all entries are processed.
    """
    from os import path
    from difflib import SequenceMatcher as sm
    from itertools import permutations
    import json
    from mptracker.nlp import normalize

    sql_names = [person.name for person in models.Person.query.all()]
    # Cache of database name -> normalized form, computed a single time.
    normalized_by_name = {
        sql_name: normalize(sql_name) for sql_name in sql_names
    }

    with open(path.relpath("mptracker/scraper/scraper_curata_out.json"),
              'r', encoding='utf-8') as f:
        scraper_result = json.load(f)

    with open(path.relpath('mptracker/scraper/romania_curata_exceptions.json'),
              'r', encoding='utf-8') as f:
        person_exceptions = json.load(f)

    def matching_score(first_name, second_name):
        """Similarity of the two strings, scaled to the range 0-100."""
        return sm(None, first_name, second_name).ratio() * 100

    def add_person(name, fortune):
        """Record `fortune` for the person named `name`; no-op if absent."""
        person = models.Person.query.filter_by(name=name).first()
        if person is None:
            return
        person.romania_curata = "\n".join(fortune)
        print("Found a match for ", name.encode('utf-8'))
        # Keep already-matched people out of later fuzzy comparisons.
        sql_names.remove(name)

    for name, fortune in scraper_result:
        name_scraper = normalize(name)

        if name_scraper in person_exceptions:
            add_person(person_exceptions[name_scraper], fortune)
        # NOTE(review): the fuzzy pass is not skipped after an exception
        # match, so one entry may be stored on two people -- verify intent.

        # Built once per scraped name; it does not depend on the database
        # name being compared.
        orderings = [
            " ".join(perm)
            for perm in permutations(name_scraper.split(" "))
        ]

        max_matching = (0, 0)
        for temporary_sqlname in sql_names:
            name_sql = normalized_by_name[temporary_sqlname]
            for ordering in orderings:
                current_matching = matching_score(ordering, name_sql)
                # Strictly-greater test keeps the earliest of equal scores.
                if max_matching[0] < current_matching:
                    max_matching = (current_matching, temporary_sqlname)

        if max_matching[0] > 93:
            add_person(max_matching[1], fortune)

    models.db.session.commit()
def explode(self, name):
    """Return the distinct words of `name` as a frozenset.

    The name is first passed through `normalize`; hyphens in the result
    are then treated as word separators alongside whitespace.
    """
    normalized = normalize(name)
    words = normalized.replace('-', ' ').split()
    return frozenset(words)