def _scan_company_numbers(prefix, last_number, empty_limit, ch, writer,
                          debug=False):
    """Probe consecutive company numbers and write matching rows to CSV.

    Starting at ``last_number + 1``, each number is passed to
    ``get_company_details``. The scan stops once a "company does not exist"
    result (``-1``) arrives while ``empty_limit`` such results have already
    been counted since the last written company.

    Args:
        prefix: String prepended to the number (e.g. ``"SC"``). When empty,
            the bare integer is passed, as the original code did.
        last_number: Last number handled by a previous run.
        empty_limit: How many ``-1`` results to tolerate before stopping.
        ch: API client forwarded to ``get_company_details``.
        writer: ``csv.writer`` that receives the rows of matching companies.
        debug: When true, print the progress lines the original British loop
            printed.

    Returns:
        The last number that was probed before the one that ended the scan.
    """
    empty_counter = 0
    while True:
        last_number += 1
        company_id = prefix + str(last_number) if prefix else last_number
        details = get_company_details(company_id, ch)
        if debug:
            print(details)

        if not details:
            # Happens only if the API returned an HTTP error or the company
            # doesn't meet our requirements; the empty counter is untouched.
            continue

        if details == -1:
            if debug:
                print("Empty counter 1 " + str(empty_counter))
            if empty_counter == empty_limit:
                if debug:
                    print("Empty counter 2 " + str(empty_counter))
                return last_number - 1
            empty_counter += 1
            continue

        empty_counter = 0
        writer.writerows(details)


def main():
    """Scan British, then Scottish company numbers and append hits to a CSV.

    Reads the last processed numbers from the JSON file ``args.last_file``,
    continues scanning from there, appends one row per matching company to
    ``args.result_file`` and finally writes the new positions back to the
    JSON file. Always terminates the process with exit status 0.
    """
    import sys  # local import: `exit` is only injected by the `site` module

    args = get_args()
    ch = CompaniesHouseAPI(_API_KEY, int(args.ratelimit))
    empty_limit = int(args.empty_limit)

    with open(args.last_file, 'r+') as last_file:
        data = json.load(last_file)
        last_num_br = int(data["british_company_last_number"])
        last_num_sc = int(data["scottish_company_last_number"])

        with open(args.result_file, "a+", newline='') as res:
            # The file is opened for appending, so write the header only
            # when it is still empty; otherwise every run would insert
            # another header row in the middle of the data.
            res.seek(0, 2)
            if res.tell() == 0:
                res.write(
                    "Company, Fullname, Address, Country, City, Postal Code\n")
            writer = csv.writer(res)

            # NOTE(review): British numbers are passed as bare ints and
            # Scottish ones as "SC" + int without zero padding -- confirm
            # the API client accepts that format.
            last_num_br = _scan_company_numbers(
                "", last_num_br, empty_limit, ch, writer, debug=True)
            last_num_sc = _scan_company_numbers(
                "SC", last_num_sc, empty_limit, ch, writer)

        # Step back over the trailing run of empty numbers that was probed
        # only to detect the end of the range.
        data["british_company_last_number"] = last_num_br - empty_limit
        data["scottish_company_last_number"] = last_num_sc - empty_limit

        # Overwrite the JSON state in place.
        last_file.seek(0)
        last_file.truncate()
        json.dump(data, last_file)

    sys.exit(0)
def get_director(number: str, ch: CompaniesHouseAPI) -> "str | None":
    """Return the name of the company's sole active director.

    The officer list is queried first; when it is empty the persons with
    significant control list and then the PSC statements list are used as
    fallbacks.

    Args:
        number: Company number to look up.
        ch: API client used for the lookups.

    Returns:
        ``None`` when none of the three lookups returned data, the
        director's name when the response reports exactly one active entry
        and that entry is a non-resigned director, otherwise ``""``.
    """
    psc = (
        ch.list_company_officers(company_number=number)
        or ch.list_company_persons_with_significant_control(
            company_number=number)
        or ch.list_company_persons_with_significant_control_statements(
            company_number=number)
    )
    if not psc:
        return None

    director = ""
    if psc.get("active_count") == 1:
        # "items" may be absent; treat that as an empty list instead of
        # failing on iteration over None.
        for officer in psc.get("items") or []:
            # Resigned officers are still listed; they must not be reported
            # as the active director.
            if officer.get("resigned_on"):
                continue
            if officer.get("officer_role") == "director":
                director = officer.get("name")
    return director
def get_company_details(number: str,
                        ch: CompaniesHouseAPI) -> "list | int | None":
    """Fetch a company and build its CSV row if it meets the requirements.

    A company qualifies when it is active, of type ``ltd``, has a registered
    office address and ``get_director`` returns a director for it.

    Args:
        number: Company number to look up.
        ch: API client used for the lookups.

    Returns:
        * a list containing one row ``[name, director, address, country,
          city, postal_code]`` (commas replaced by spaces) for a qualifying
          company,
        * ``-1`` when the API returned no company data,
        * ``None`` when the company exists but does not qualify, or when the
          API failed again on the retry.
    """
    try:
        company = ch.get_company(company_number=number)
    except HTTPError as e:
        print("Companies House API returned error %s" % str(e))
        # Sometimes Companies House returns 502; wait 15 seconds and retry.
        sleep(15)
        try:
            company = ch.get_company(company_number=number)
        except HTTPError as retry_error:
            # Give up on this number instead of aborting the whole scan.
            print("Companies House API returned error %s" % str(retry_error))
            return None

    if not company:  # empty response: the company does not exist
        print(str(number) + " company does not exist or meet our requirements")
        return -1

    # The creation date is only used for this log line, so a profile without
    # it must not abort the lookup.
    created_on = company.get("date_of_creation")
    if created_on:
        creation_date = datetime.datetime.strptime(
            created_on, "%Y-%m-%d").date()
        age_days = (datetime.datetime.now().date() - creation_date).days
        print("Company was registered " + str(age_days) + " days ago")

    qualifies = (company.get("company_status") == "active"
                 and "registered_office_address" in company
                 and company.get('type') == "ltd")
    if qualifies:
        director = get_director(number, ch)
        if director:
            name = company["company_name"]
            address, country, city, postal_code = get_address(company)
            print(name)
            print(director)
            print(address)
            print(number)
            fields = (name, director, address, country, city, postal_code)
            # Commas are replaced so the values stay inside their CSV column.
            return [[str(field).replace(',', ' ') for field in fields]]

    print(str(number) + " company does not exist or meet our requirements")
    return None
#!/home/kimgid/.virtualenvs/myproject/bin/python
import datetime
import json
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pytesseract
import requests
from companies_house.api import CompaniesHouseAPI
from fuzzywuzzy import fuzz
from pdf2image import convert_from_path
from PIL import Image

# SECURITY: an API key is committed in this file. The COMPANIES_HOUSE_API_KEY
# environment variable now takes precedence; the literal is kept only so that
# existing setups keep working and should be rotated and removed.
ch = CompaniesHouseAPI(
    os.environ.get('COMPANIES_HOUSE_API_KEY',
                   'HwcI7GpyQ7KzwZjE9lf0cqXIlDU1M6dy0CyzgCvQ'))

# Disabled example: download a single filing document as a PDF.
# url = "https://beta.companieshouse.gov.uk//company/05396788/filing-history/MzI2MDEzNTAwMmFkaXF6a2N4/document?format=pdf"
# r = requests.get(url)
# with open('test1.pdf', 'wb') as fp:
#     fp.write(r.content)

# Load today's links
with open('todays_links.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

# Filled in further down the script.
matches = []

# Create test tech giants to check
#!/home/kimgid/.virtualenvs/myproject/bin/python from companies_house.api import CompaniesHouseAPI ch = CompaniesHouseAPI('HwcI7GpyQ7KzwZjE9lf0cqXIlDU1M6dy0CyzgCvQ') import pandas as pd import numpy as np import matplotlib.pyplot as plt from fuzzywuzzy import fuzz import time allcompanies2010 = pd.read_csv('data2010.csv').to_dict(orient='records') company_names1 = [] for i in allcompanies2010: company_names1.append(i['0']) companydata = [] for i in company_names1: d = {} time.sleep(0.55) d['search'] = i d['companiesinsearch'] = ch.search_companies(q=i) d['numbercompanies'] = len(d['companiesinsearch']['items']) companydata.append(d) print(d) companydata2 = [] for i in companydata: for j in i['companiesinsearch']['items']: score = fuzz.token_sort_ratio(j['title'], i['search']) m = {} m['search'] = i['search']
with open('player_list.csv', 'r', encoding='utf-8-sig') as f: csv_reader = csv.DictReader(f) for row in csv_reader: d = dict() d['club'] = row['Club'] d['name'] = row['Name'] d['position'] = row['Position'] d['country'] = row['Country'] player_list.append(d) return player_list if __name__ == '__main__': with open('config.yaml', 'r') as f: config = yaml.safe_load(f) ch = CompaniesHouseAPI(config['chapi_key']) players = load_player_list() if len(players) > 600: print(f'Warning: {len(players)} will likely cause the API rate limit to be exceeded.') for player in players[0:10]: results = ch.search_officers(q=player['name']) print(f"{player['name']}: {results['total_results']}") for item in results['items']: found_name = item['title'] lev = levenshtein_score(player['name'], found_name) if lev > 0.9: print(f"good hit: {found_name} >< {player['name']} ({lev})") else:
#!/home/kimgid/.virtualenvs/myproject/bin/python from companies_house.api import CompaniesHouseAPI ch = CompaniesHouseAPI('HwcI7GpyQ7KzwZjE9lf0cqXIlDU1M6dy0CyzgCvQ') import pandas as pd import numpy as np import json import matplotlib.pyplot as plt from fuzzywuzzy import fuzz import time import datetime # Read the pre-gathered list of UK startup names allcompanies2010 = pd.read_csv('companydata3.csv').to_dict(orient='records') # Call API for filing history todayslinks = [] df2010 = [] for j, i in enumerate(allcompanies2010): f = {} f['chnumber'] = i['chnumber'] f['company_name'] = i['officialname'] f['search'] = i['search'] print('checking:') print(f['company_name']) try: f['filhis'] = ch.list_company_filing_history( company_number=f['chnumber'], items_per_page='10')['items'] for d in f['filhis']: b = {}
def setUp(self):
    """Build the API clients used by the tests.

    Creates one ``CompaniesHouseAPIBase`` and one ``CompaniesHouseAPI``
    instance from the module-level ``API_KEY`` and stores them on ``self``.
    (Presumably called before each test by the test framework -- the
    enclosing class is not visible here.)
    """
    key = API_KEY
    # Low-level client first, then the full client, both with the same key.
    self.base_api: CompaniesHouseAPIBase = CompaniesHouseAPIBase(key)
    self.api: CompaniesHouseAPI = CompaniesHouseAPI(key)
from companies_house.api import CompaniesHouseAPI from datetime import datetime from string import capwords import pprint api_key = 'gHggW0wcFUkPigIifYRo864nCxGBqqIYMLm3Pd_O' ch = CompaniesHouseAPI(api_key) company_request_cache = {} officers_request_cache = {} def get_officers(company_number): officers_request = None if company_number in officers_request_cache is True: officers_request = officers_request_cache[company_number] else: company_request = get_company(company_number) if company_request is not None: officers_request = ch.list_company_officers( company_number=company_number) if officers_request is not None: ps = datetime.fromisoformat( company_request['accounts']['next_accounts'] ['period_start_on']) pe = datetime.fromisoformat(company_request['accounts'] ['next_accounts']['period_end_on'])