def save_to_csv(self, response, **meta):
    """Assemble one AL massage-therapy licence item from scraped fields.

    Every key read from ``meta`` below is required; a missing key raises
    ``KeyError``.  Returns the result of ``ItemLoader.load_item()``.
    """
    loader = ItemLoader(item=AlMassageTherapyLicensesSpiderItem(),
                        response=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'AL_Massage_Therapy_Licenses')
    loader.add_value('url', 'http://www.almtbd.alabama.gov/licensee.aspx')
    loader.add_value('category', meta['category'])
    # _getDBA's result is indexed: [0] feeds company_name, [1] feeds dba_name.
    loader.add_value('company_name', self._getDBA(meta['company_name'])[0])
    loader.add_value('dba_name', self._getDBA(meta['company_name'])[1])

    # (item field, meta key) pairs copied across unchanged, in output order.
    passthrough = (
        ('approved by', 'approved_by'),
        ('permit_lic_no', 'permit_lic_no'),
        ('renewal date', 'Renewal_Date'),
        ('permit_lic_status', 'permit_lic_status'),
        ('location_address_string', 'location_address_string'),
        ('mail_address_string', 'mailing_address'),
        ('person_name', 'person_name'),
        ('person_subtype', 'person_subtype'),
        ('company_phone', 'company_phone'),
        ('company_fax', 'fax'),
        ('permit_lic_eff_date', 'permit_lic_eff_date'),
        ('permit_lic_exp_date', 'permit_lic_exp_date'),
        ('approved date', 'approved_date'),
        ('company_email', 'email'),
        ('company_website', 'website_address'),
        ('permit_lic_desc', 'permit_lic_desc'),
    )
    for field_name, meta_key in passthrough:
        loader.add_value(field_name, meta[meta_key])

    loader.add_value('permit_type', 'therapy_license')
    return loader.load_item()
def save_to_csv(self, response, **meta):
    """Assemble one IL Kankakee food-inspection item from scraped fields.

    Every key read from ``meta`` below is required; a missing key raises
    ``KeyError``.  Returns the result of ``ItemLoader.load_item()``.
    """
    loader = ItemLoader(item=IlKankakeeFoodInspectionsSpiderItem(),
                        response=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'IL_Kankakee_Food_Inspections')
    loader.add_value(
        'url',
        'http://www.kankakeehealth.org/environmental-health/food-sanitation/food_inspections.html'
    )

    # (item field, meta key) pairs copied across unchanged, in output order.
    passthrough = (
        ('violation_date', 'violation_date'),
        ('permit_lic_no', 'permit_lic_no'),
        ('location_address_string', 'location_address_string'),
        ('inspector_comments', 'inspector_comments'),
        ('inspection_date', 'inspection_date'),
        ('company_name', 'company_name'),
        ('violation_rule_id', 'rule_id'),
        ('violation_subtype', 'violation_subtype'),
        ('inspection_pass_fail', 'inspection_pass_fail'),
        ('violation category', 'violation_category'),
        ('dba_name', 'dba_name'),
        ('inspection_type', 'inspection_type'),
        ('violation_description', 'violation_description'),
        ('risk category', 'risk'),
        ('abate_date', 'abate_date'),
        ('abate_status', 'abate_status'),
        ('temperature observations-item/location', 'temperature'),
        ('inspection_subtype', 'inspection_subtype'),
        ('violation_rule', 'rule'),
        ('permit_lic_desc', 'permit_lic_desc'),
    )
    for field_name, meta_key in passthrough:
        loader.add_value(field_name, meta[meta_key])

    loader.add_value('permit_type', 'restaurant_license')
    loader.add_value('violation_type', meta['violation_type'])
    return loader.load_item()
def parse_row(self, response, row):
    """Yield one loaded item built from a single directory ``row``.

    ``row`` is read with both ``row[...]`` (required columns) and
    ``row.get(...)`` (optional columns).  ``response`` is not used.
    """
    loader = ItemLoader(item=IlHospitalLicensesSpiderItem())
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value(
        'url',
        'https://data.illinois.gov/dataset/410idph_hospital_directory/resource/9bdedb85-77f3-490a-9bbd-2f3f5f227981'
    )
    loader.add_value('sourceName', 'IL_Renal_Desease_Facility_Licenses')
    loader.add_value('permit_type', "medical_license")

    name = self._getDBA(row['End Stage Renal Disease'])
    primary_name = name[0]
    # Drop every " -" occurrence from the primary name when present.
    if ' -' in str(primary_name):
        company_name = str(primary_name).replace(' -', '')
    else:
        company_name = primary_name

    street = row['Address']
    city = row['City']
    # Cut the zip at its last "." (e.g. a float-like "60601.0" -> "60601").
    zip_text = str(row['Zip'])
    if '.' in zip_text:
        zip_text = zip_text[:zip_text.rfind('.')]
    address = self.format__address_4(street, city, 'IL', zip_text)

    loader.add_value('dba_name', name[1])
    loader.add_value('permit_lic_no', row.get('Medicare #', ''))

    raw_expiry = row.get('Exp. Date')
    if raw_expiry:
        expiry = self.format_date(raw_expiry)
    else:
        expiry = ''
    loader.add_value('permit_lic_exp', expiry)

    loader.add_value('company_name', company_name)
    loader.add_value('location_address_string', address)
    loader.add_value('county', row.get('County', ''))

    if primary_name:
        description = "Medical License for " + company_name
    else:
        description = "Medical License"
    loader.add_value('permit_lic_desc', description)

    loader.add_value('company_phone', row.get('Phone', ''))
    loader.add_value('company_subtype', row.get('Type', '') or 'Medical License')
    yield loader.load_item()
def save_to_csv(self, response, **meta):
    """Assemble one Iowa City building-permit item from scraped fields.

    Every key read from ``meta`` below is required; a missing key raises
    ``KeyError``.  Returns the result of ``ItemLoader.load_item()``.
    """
    loader = ItemLoader(item=IaJohnsonIowacityBuildingPermitsSpiderItem(),
                        response=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'IA_Johnson_IowaCity_Building_Permits')
    loader.add_value('url', 'http://www.iowa-city.org/IcgovApps/Tidemark/Search')

    # (item field, meta key) pairs copied across unchanged, in output order.
    passthrough = (
        ('permit_lic_no', 'case_number'),
        ('permit_lic_status', 'status'),
        ('location_address_string', 'address'),
        ('permit_lic_desc', 'description'),
        ('case actions-date', 'case_action_date'),
        ('case actions-description', 'case_action_description'),
        ('case actions-status', 'case_action_status'),
        ('case action-notes', 'case_action_notes'),
        ('inspection_date', 'inspection_date'),
        ('inspection_type', 'inspection_type'),
        ('inspection_pass_fail', 'inspection_pass_fail'),
        ('inspection_description', 'inspection_description'),
        ('violation_date', 'violation_date'),
        ('violation_type', 'violation_type'),
    )
    for field_name, meta_key in passthrough:
        loader.add_value(field_name, meta[meta_key])

    loader.add_value('permit_type', 'building_permit')
    return loader.load_item()
def save_to_csv(self,response,**meta):
    """Assemble one Kittitas County building-permit item.

    ``meta`` supplies the scraped fields by keyword.  ``dba_name`` is
    optional: it is overwritten from the owner name when one is present and
    otherwise read with ``meta.get`` so an ownerless record no longer raises
    ``KeyError``.  Returns the result of ``ItemLoader.load_item()``.

    NOTE(review): the source was whitespace-collapsed; the ``add_value``
    calls are assumed to sit outside the ``if`` blocks -- confirm.
    """
    il = ItemLoader(item=WaKittitasBuildingPermitsSpiderItem())
    il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    il.add_value('sourceName', 'WA_Kittitas_Building_Permits')
    il.add_value('url', 'https://www.co.kittitas.wa.us/cds/building/reports.aspx')
    il.add_value('report date', meta['date'])
    il.add_value('permit_lic_no', meta['permit_number'])
    il.add_value('permit_subtype', meta['permit_type'])
    il.add_value('permit_lic_desc', meta['permit_lic_desc'])
    il.add_value('location_address_string', meta['address'])

    valuation = meta['valuation']
    if valuation:
        # Remove the literal "$0.00" text from the valuation.
        valuation = valuation.replace('$0.00', '')
    il.add_value('permit_lic_value', valuation)
    il.add_value('permit_lic_fee', meta['fees'])

    owner_name = meta['owner_name']
    dba_name = meta.get('dba_name')
    if owner_name:
        # Split once: [0] replaces the owner name, [1] becomes the DBA name.
        name_parts = self._getDBA(owner_name)
        owner_name = name_parts[0]
        dba_name = name_parts[1]
    il.add_value('mixed_name', owner_name)
    il.add_value('dba_name', dba_name)

    il.add_value('mixed_subtype', meta['mixed_subtype'])
    il.add_value('mail_address_string', meta['mailing'])
    il.add_value('contractor_company', meta['contractor'])
    il.add_value('contractor_dba', meta['contractor_dba'])
    il.add_value('parcel #', meta['parcel_number'])
    il.add_value('permit_lic_eff_date', meta['issue_date'])
    il.add_value('permit_type', 'building_permit')
    return il.load_item()
def spider_opened(self, spider):
    """Open the CSV output file for ``spider`` and start the exporter.

    When the spider has a truthy ``start`` attribute, the alphanumeric
    characters of ``spider.start`` and ``spider.end`` are inserted between
    the stem and extension of ``self.file_name``.  The file is opened in
    ``'w+b'`` mode and kept on ``self.file``; it is not closed here.
    """
    if hasattr(spider, 'start') and spider.start:
        stem, extension = os.path.splitext(self.file_name)

        def alnum_only(text):
            return ''.join(ch for ch in text if ch.isalnum())

        range_tag = "_{}_{}".format(alnum_only(spider.start),
                                    alnum_only(spider.end))
        self.file_name = stem + range_tag + extension

    resume = self.appendMode
    path_parts = [settings.get('STORAGE_DIR'), self._settings.get('JIRA_ID')]
    if resume:
        # Append-mode runs are written under a per-run "resume_<timestamp>" folder.
        path_parts.append('resume_{}'.format(Utils.getingestion_timestamp()))
    if self.file_name:
        path_parts.append(self.file_name)
    else:
        path_parts.append('%s_items.csv' % spider.name)
    outpath = os.path.join(*path_parts)

    self.createFolder(outpath)
    self.file = open(outpath, 'w+b')

    exporter_options = {'delimiter': self.delimiter}
    # Optional settings are forwarded only when they are truthy.
    for option in ('fields_to_export', 'null_header'):
        option_value = getattr(self, option)
        if option_value:
            exporter_options[option] = option_value

    self.exporter = CustomCsvItemExporter(self.file, **exporter_options)
    self.exporter.start_exporting()
def save_csv(self, response, data_dic):
    """Return an ``ItemLoader`` filled with the HI_SOS constants and ``data_dic``.

    The loader itself is returned (``load_item()`` is not called here).
    """
    loader = ItemLoader(item=HiSosSpiderItem(), response=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'HI_SOS')
    loader.add_value('url', 'https://hbe.ehawaii.gov/documents/search.html')
    loader.add_value('permit_type', 'business_license')
    # Copy every scraped field across under its own key.
    for field_name, field_value in data_dic.items():
        loader.add_value(field_name, field_value)
    return loader
def save_csv(self,response,data_dic):
    """Return an ``ItemLoader`` filled with the IL agriculture constants and ``data_dic``.

    ``response`` is accepted but not used.  The loader itself is returned
    (``load_item()`` is not called here).
    """
    loader = ItemLoader(item=IlAgricultureLicensesSpiderItem())
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'IL_Agriculture_Licenses')
    loader.add_value('permit_type', 'agriculture_license')
    loader.add_value('url', 'https://www2.illinois.gov/sites/agr/licenses/Pages/A-Z-License-List.aspx')
    # Copy every scraped field across under its own key.
    for field_name, field_value in data_dic.items():
        loader.add_value(field_name, field_value)
    return loader
def save_to_csv(self, response, **meta):
    """Assemble one Kittitas County building-permit item, cleaning raw fields.

    ``meta`` supplies the scraped fields by keyword.  The address, parcel
    number and issue date are normalised before being stored.
    ``company_name`` / ``dba_name`` are derived from ``owner_name`` when it
    is truthy and otherwise read with ``meta.get`` so an ownerless record
    does not raise ``KeyError``.  Returns ``ItemLoader.load_item()``.

    NOTE(review): the source was whitespace-collapsed; the nesting below is
    the most plausible reading -- confirm against the original file.
    """
    il = ItemLoader(item=WaKittitasBuildingPermitsSpiderItem())
    il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    il.add_value('sourceName', 'WA_Kittitas_Building_Permits')
    il.add_value(
        'url', 'https://www.co.kittitas.wa.us/cds/building/reports.aspx')
    il.add_value('report date', meta['date'])
    il.add_value('permit_lic_no', meta['permit_number'])
    il.add_value('permit_subtype', meta['permit_type'])
    il.add_value('permit_lic_desc', meta['permit_lic_desc'])

    address = meta['address']
    if len(address.split(',')) > 3:
        # Keep everything up to and including the first "WA <digits>" token.
        # Without such a token the address is left untouched instead of
        # crashing on a None match.
        state_zip = re.search(r"WA \d+", address)
        if state_zip:
            address = address[:state_zip.end()]
    elif 'PERMIT' in address:
        address = 'WA'
    if ':,' in address or ':AL,' in address:
        # Keep only the segment after the first ":," (":AL," is folded into it).
        address = address.replace(':AL,', ':,').split(':,')[1]
    if ',' not in address:
        address = address + ', WA'
    il.add_value(
        'location_address_string',
        address.replace('Address:', 'WA').replace('WA, WA', 'WA'))

    il.add_value('permit_lic_value', meta['valuation'])
    il.add_value('permit_lic_fee', meta['fees'])

    company_name = meta.get('company_name')
    dba_name = meta.get('dba_name')
    if meta['owner_name']:
        # Split once: [0] is used as the name, [1] as the DBA name.
        name_parts = self._getDBA(meta['owner_name'])
        company_name = name_parts[0]
        dba_name = name_parts[1]
    il.add_value('mixed_name', company_name)
    il.add_value('dba_name', dba_name)

    il.add_value('mixed_subtype', meta['mixed_subtype'])
    il.add_value('mail_address_string', meta['mailing'])
    il.add_value('contractor_company', meta['contractor'])
    il.add_value('contractor_dba', meta['contractor_dba'])

    # Keep only the text before the first B, F, M or T in the parcel number
    # (the whole value when none of those letters occurs).
    il.add_value('parcel #', re.split(r'[BFMT]', meta['parcel_number'])[0])

    issue_date = meta['issue_date']
    if ':' in issue_date:
        # Keep the segment that follows the first colon.
        issue_date = issue_date.split(':')[1]
    il.add_value('permit_lic_eff_date', issue_date)

    il.add_value('permit_type', 'building_permit')
    return il.load_item()
def save_csv(self,response,data_dic):
    """Return an ``ItemLoader`` filled with the AL food-inspection constants and ``data_dic``.

    The loader's default input processor strips each value, removes tags,
    collapses whitespace runs to a single space and replaces escape
    characters.  The loader itself is returned (``load_item()`` is not
    called here).
    """
    def trim(value):
        return value.strip()

    def squeeze_whitespace(data):
        if data:
            return re.sub(r'\s+', ' ', data)
        return ''

    loader = ItemLoader(item=AlFoodInspectionsSpiderItem(), response=response)
    loader.default_input_processor = MapCompose(
        trim, remove_tags, squeeze_whitespace, replace_escape_chars)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'AL_Food_Inspections')
    loader.add_value('url', 'http://www.alabamapublichealth.gov/foodscores/index.html')
    # Copy every scraped field across under its own key.
    for field_name, field_value in data_dic.items():
        loader.add_value(field_name, field_value)
    return loader
def save_csv(self, response, data_dic):
    """Return an ``ItemLoader`` filled with the ME_SOS constants and ``data_dic``.

    Each value from ``data_dic`` is passed through ``self.remove_tag``
    before being added.  The loader itself is returned (``load_item()`` is
    not called here).
    """
    # The response is handed over under the ``link_page`` keyword, not ``response``.
    loader = ItemLoader(item=MeSosSpiderItem(), link_page=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'ME_SOS')
    loader.add_value('permit_type', 'business_license')
    loader.add_value('url', 'https://icrs.informe.org/nei-sos-icrs/ICRS?MainPage=x')
    for field_name, raw_value in data_dic.items():
        loader.add_value(field_name, self.remove_tag(raw_value))
    return loader
def save_csv(self, response, data_dic):
    """Return an ``ItemLoader`` filled with the OH_SOS constants and ``data_dic``.

    The loader itself is returned (``load_item()`` is not called here).
    """
    loader = ItemLoader(item=OhSosSpiderItem(), response=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'OH_SOS')
    loader.add_value(
        'url',
        'https://www5.sos.state.oh.us/ords/f?p=100:1:::NO:1:P1_TYPE:NAME')
    loader.add_value('permit_type', 'business_license')
    # Copy every scraped field across under its own key.
    for field_name, field_value in data_dic.items():
        loader.add_value(field_name, field_value)
    return loader
def parse_details(self, response):
    """Yield one item per result row that has a detail link, then paginate.

    For each table row after the header, the linked detail page is requested
    and its labelled fields are combined with the row's name columns.
    After the rows, the next results page is requested if a pager link
    exists; otherwise the search is restarted from ``self.start_urls[0]``
    while ``self.search_element_a`` or ``self.search_element`` is non-empty.

    NOTE(review): the source was whitespace-collapsed; item construction is
    assumed to be nested under ``if link`` -- confirm.
    """
    result_rows = response.xpath(
        '//*[@id="ctl00_ContentPlaceHolder1_dtgResults"]//tr')[1:]
    for row in result_rows:
        link = row.xpath('td[10]/a/@href').extract_first()
        company_name = row.xpath('td[4]/text()').extract_first()
        f_name = row.xpath('td[1]/text()').extract_first()
        m_name = row.xpath('td[2]/text()').extract_first()
        l_name = row.xpath('td[3]/text()').extract_first()
        person_name = self.format_name(f_name, m_name, l_name)
        # Fall back to the person's name when the company cell is missing or short.
        if not (company_name and len(company_name) > 2):
            company_name = person_name
        if not link:
            continue

        link_url = 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/' + str(link)
        # The value sent back into this generator is used as the detail
        # response -- presumably supplied by a decorator outside this view.
        parse_res = yield scrapy.Request(url=link_url, dont_filter=True)

        def labelled(label):
            # Text of the span in the cell that follows the labelled node.
            return parse_res.xpath(
                '//*[contains(text(),"%s")]/following-sibling::td/span/text()'
                % label).extract_first()

        add = labelled('City')
        state = labelled('State')
        if add and state:
            location_address_string = add + ', ' + state
        else:
            location_address_string = state
        permit_lic_no = labelled('License Number')
        permit_subtype = labelled('License Type')
        permit_lic_exp_date = labelled('License Expiration Date')
        permit_lic_status = labelled('License Status')
        disciplinary_action = labelled('Disciplinary Action')

        if company_name and len(company_name) > 2:
            description = 'Cosmetology License for' + ' ' + str(company_name)
        else:
            description = 'Cosmetology License'
        if not (location_address_string and len(location_address_string) > 2):
            location_address_string = 'AL'

        loader = ItemLoader(item=AlCosmetologyLicensesSpiderItem(), response=response)
        loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        loader.add_value('url', 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/VerificationSearch.aspx')
        loader.add_value('sourceName', 'AL_Cosmetology_Licenses')
        loader.add_value('permit_lic_exp_date', permit_lic_exp_date)
        loader.add_value('permit_lic_status', permit_lic_status)
        loader.add_value('person_name', person_name)
        loader.add_value('violation_type', '')
        loader.add_value('disciplinary action', disciplinary_action)
        loader.add_value('permit_lic_desc', description)
        loader.add_value('permit_type', 'cosmetology_license')
        loader.add_value('location_address_string', location_address_string)
        loader.add_value('permit_lic_no', permit_lic_no)
        loader.add_value('company_name', company_name)
        loader.add_value('permit_subtype', permit_subtype)
        yield loader.load_item()

    next_href = response.xpath(
        '//td[@colspan="10"]/span/following-sibling::a/@href').extract_first()
    if next_href:
        postback = JavaScriptUtils.getValuesFromdoPost(next_href)
        page_data = {
            '__EVENTTARGET': postback['__EVENTTARGET'],
            '__EVENTARGUMENT': postback['__EVENTARGUMENT'],
        }
        # Echo the page's hidden form fields back with the postback.
        for hidden_id in ('__VIEWSTATE', '__VIEWSTATEGENERATOR',
                          '__EVENTVALIDATION', '__VIEWSTATEENCRYPTED'):
            page_data[hidden_id] = response.xpath(
                '//*[@id="%s"]/@value' % hidden_id).extract_first()
        yield scrapy.FormRequest(url=response.url, method='POST',
                                 formdata=page_data,
                                 callback=self.parse_details,
                                 dont_filter=True)
    elif len(self.search_element_a) > 0:
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse,
                             dont_filter=True)
    elif len(self.search_element) > 0:
        self.check_first = True
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse,
                             dont_filter=True)
def save_csv(self, response, data_dic):
    """Return an ``ItemLoader`` filled with the GA Henry permit constants and ``data_dic``.

    The loader itself is returned (``load_item()`` is not called here).
    """
    loader = ItemLoader(item=GaHenryBuildingPermitsSpiderItem(),
                        response=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'GA_Henry_Building_Permits')
    loader.add_value(
        'url', 'https://www.sagesgov.com/henrycounty-ga/Portal/Search.aspx')
    loader.add_value('permit_type', 'building_permit')
    # Copy every scraped field across under its own key.
    for field_name, field_value in data_dic.items():
        loader.add_value(field_name, field_value)
    return loader
def parse_row(self, response, row):
    """Yield one loaded item built from a single asbestos-worker ``row``.

    All columns are read with ``row[...]`` and are therefore required.
    ``response`` is not used.
    """
    loader = ItemLoader(item=IlAsbestosWorkerLicensesSpiderItem())
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'IL_Asbestos_Worker_Licenses')
    loader.add_value('url', 'https://data.illinois.gov/dataset/378idph_asbestos_licensed_workers/resource/f3266216-1c0e-4326-acb7-0f4341d1b463')

    person_address = self.format__address_4(
        row['Expr1'], row['tech_city'], row['tech_state'], row['tech_zip'])
    loader.add_value('person_address_string', person_address)

    full_name = row['tech_name'] + ' ' + row['LAST_NAME']
    loader.add_value('person_name', full_name)
    loader.add_value('permit_lic_desc', 'Asbestos Contractor License')
    loader.add_value('dba_name', '')
    loader.add_value('person_phone', row['Expr2'])
    loader.add_value('county', row['COUNTY'])

    # Numbers shorter than nine characters get a single leading zero.
    if len(row['lic_id_number']) < 9:
        licence_number = '0' + row['lic_id_number']
    else:
        licence_number = row['lic_id_number']
    loader.add_value('permit_lic_no', licence_number)

    loader.add_value('permit_type', 'asbestos_contractor_license')
    yield loader.load_item()
def save_to_csv(self, response, **meta):
    """Assemble one OR alcohol-server-educator item from scraped fields.

    Reads ``url``, ``type_val``, ``company_name``, ``class_in``, ``phone``
    and ``email`` from ``meta`` (all required).  ``response`` is not used.
    Returns the result of ``ItemLoader.load_item()``.
    """
    loader = ItemLoader(item=OrAlcoholServerEducatorLicensesSpiderItem())
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('sourceName', 'OR_Alcohol_Server_Educator_Licenses')
    loader.add_value('url', meta['url'])
    loader.add_value('type', meta['type_val'])
    # _getDBA's result is indexed: [0] feeds company_name, [1] feeds dba_name.
    loader.add_value('company_name', self._getDBA(meta['company_name'])[0])
    loader.add_value('dba_name', self._getDBA(meta['company_name'])[1])
    loader.add_value('classes in/online course in', meta['class_in'])
    # NOTE(review): field name has a capital "S" ("..._String"), unlike the
    # other spiders -- confirm it matches the item definition.
    loader.add_value('location_address_String', 'OR')
    phone_number = meta['phone'].replace('Phone:', '')
    loader.add_value('company_phone', phone_number)
    # NOTE(review): the website field is filled from meta['email'] -- confirm intent.
    loader.add_value('company_website', meta['email'])
    return loader.load_item()
def save_to_csv(self, response, **data_pass):
    """Assemble one Mecklenburg building-permit item from scraped fields.

    Every key read from ``data_pass`` below is required; a missing key
    raises ``KeyError``.  Values pass through a default input processor that
    strips them, removes tags and replaces escape characters.  Returns the
    result of ``ItemLoader.load_item()``.
    """
    il = ItemLoader(item=VaMecklenburgBuildingPermitsSpiderItem(),
                    response=response)
    il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags,
                                            replace_escape_chars)
    il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    il.add_value('sourceName', 'VA_Mecklenburg_Building_Permits')
    il.add_value(
        'url',
        'https://webpermit.mecklenburgcountync.gov/Default.aspx?PossePresentation=SearchByAddress'
    )

    # (item field, data_pass key) pairs, in output order.  'permit_subtype'
    # appears once only: the loader appends on every add_value call, so the
    # previous second call collected the same value twice.
    field_sources = (
        ('person_address_string', 'person_address_string'),
        ('permit_lic_no', 'permit_lic_no'),
        ('master #', 'master #'),
        ('submittal #', 'submittal #'),
        ('permit_subtype', 'permit_subtype'),
        ('permit_lic_status', 'permit_lic_status'),
        ('location_address_string', 'location_address_string'),
        ('parcel #', 'parcel #'),
        # The occupancy type feeds both the subtype and the raw column.
        ('occupancy_subtype', 'occupancy type'),
        ('occupancy type', 'occupancy type'),
        ('usdc code', 'usdc code'),
        ('type of building', 'type of building'),
        ('equipment type', 'equipment type'),
        ('permit_lic_fee', 'permit_lic_fee'),
        ('mixed_name', 'mixed_name'),
        ('mixed_subtype', 'mixed_subtype'),
        ('mixed_phone', 'mixed_phone'),
        ('mixed_contractor_name', 'mixed_contractor_name'),
        ('contractor id', 'contractor id'),
        ('contractor_phone', 'contractor_phone'),
        ('contractor_lic_no', 'contractor_lic_no'),
        ('contractor_lic_type', 'contractor_lic_type'),
        ('contractor_address_string', 'contractor_address_string'),
        ('inspection_id', 'inspection_id'),
        ('inspection_subtype', 'inspection_subtype'),
        ('inspection_date', 'inspection_date'),
        ('inspection_pass_fail', 'inspection_pass_fail'),
        ('inspection_type', 'inspection_type'),
    )
    for field_name, source_key in field_sources:
        il.add_value(field_name, data_pass[source_key])

    il.add_value('permit_type', 'building_permit')
    return il.load_item()
def parse_pdf(self, response):
    """Yield one loaded item per entry extracted from the practitioner PDF.

    ``self.__extractData(response)`` is iterated as rows of dict-like
    entries; each entry is read by the keys ``phone``, ``f_name``,
    ``l_name``, ``expiration``, ``cert``, ``level``, ``address``, ``city``,
    ``state`` and ``zip``.
    """
    # Expanded descriptions for the known level codes; any other code is
    # used as its own description.
    level_names = {
        'F': 'FORESTER',
        'SFPH': 'SUPERVISING FOREST PRODUCTS HARVESTER',
        'FPH': 'FOREST PRODUCTS HARVESTER',
    }
    for row in self.__extractData(response):
        for col in row:
            il = ItemLoader(item=CtForestPractitionerLicenseSpiderItem())
            il.default_input_processor = MapCompose(
                lambda v: v.strip(), remove_tags, replace_escape_chars)
            il.add_value('ingestion_timestamp',
                         Utils.getingestion_timestamp())
            il.add_value(
                'url',
                'https://www.depdata.ct.gov/forestry/ForestPractitioner/directry.pdf'
            )
            il.add_value('sourceName', 'CT_Forest_Practitioner_License')
            il.add_value('person_phone', col['phone'])
            il.add_value('person_name', col['f_name'] + ' ' + col['l_name'])

            # The expiration cell may carry a second space-separated token
            # (the extended-permit code) after the date.
            expiration = col['expiration']
            if ' ' in expiration:
                pieces = expiration.split(' ')
                date, e_permit = pieces[0], pieces[1]
            else:
                date, e_permit = expiration, ''
            il.add_value('permit_lic_exp_date', date)
            if '490' in e_permit:
                e_permit = "490- permitted to assist landowners seeking classification of their land as 'Forest Land'"
            il.add_value('extended permit', e_permit)
            il.add_value('permit_lic_no', col['cert'])

            level_desc = level_names.get(col['level'], col['level'])
            il.add_value('level', col['level'])
            il.add_value('permit_subtype', level_desc)
            il.add_value('permit_lic_desc', level_desc)
            il.add_value('permit_type', 'forester_license')

            location_address_string = (col['address'] + ', ' + col['city'] +
                                       ', ' + col['state'] + ' ' + col['zip'])
            il.add_value('location_address_string', location_address_string)
            yield il.load_item()
def save_to_csv(self, response, data_dic):
    """Return an ``ItemLoader`` filled with one NH_SOS business record.

    Every key read from ``data_dic`` below is required.  Values pass through
    a default input processor that strips them, removes tags and replaces
    escape characters.  The loader itself is returned (``load_item()`` is
    not called here).
    """
    loader = ItemLoader(item=NhSosSpiderItem(), response=response)
    loader.default_input_processor = MapCompose(lambda v: v.strip(),
                                                remove_tags,
                                                replace_escape_chars)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value('url',
                     'https://quickstart.sos.nh.gov/online/BusinessInquire')
    loader.add_value('sourceName', 'NH_SOS')

    creation_date = data_dic['business_creation_date'].replace(
        'NOT-AVAILABLE', '')
    loader.add_value('creation_date', creation_date)

    # Use the scraped DBA when it is longer than three characters; otherwise
    # take element [1] of _getDBA(company_name).
    scraped_dba = data_dic['dba_name']
    if scraped_dba and len(scraped_dba) > 3:
        dba_name = scraped_dba
    else:
        dba_name = self._getDBA(data_dic['company_name'])[1]
    loader.add_value('dba_name', dba_name)

    loader.add_value('non_profit_indicator', data_dic['non_profit_indicator'])
    loader.add_value('mail_address_string', data_dic['mailing_address_string'])
    loader.add_value('status', data_dic['business_status'])
    loader.add_value('citizenship / state of formation',
                     data_dic['state_of_formation'])
    loader.add_value('duration', data_dic['duration'])

    mixed_name = data_dic['mixed_name']
    if mixed_name is None:
        mixed_name = ''
    loader.add_value('mixed_name', mixed_name)

    loader.add_value('company_name', self._getDBA(data_dic['company_name'])[0])
    loader.add_value('company_phone', data_dic['phone'].replace('NONE', ''))
    loader.add_value('inactive_date', data_dic['inactive_date'])
    loader.add_value('homestate name', self._getDBA(data_dic['host_name'])[0])
    loader.add_value('naics_description', data_dic['naics_description'])
    loader.add_value('permit_type', 'business_license')
    loader.add_value('mixed_subtype', data_dic['mixed_subtype'])
    loader.add_value('previous name', data_dic['previous_name'])
    loader.add_value('company_subtype',
                     self._getDBA(data_dic['business_type'])[0])
    loader.add_value('entity_id', data_dic['business_id'])

    # Addresses of five characters or fewer fall back to the state code.
    location = data_dic['location_address_string']
    if not (location and len(location) > 5):
        location = 'NH'
    loader.add_value('location_address_string', location)

    loader.add_value('company_email',
                     data_dic['business_mail'].replace('NONE', ''))
    loader.add_value('person_address_string',
                     data_dic['person_address_string'])
    return loader
def save_csv(self,response,main_res,permit_lic_no):
    """Yield one RI septic-system item scraped from a permit detail page.

    Args:
        response: listing response; ``response.meta['city']`` is read.
        main_res: detail-page response queried with XPath for the labelled
            values (Location, Plat/Lot/Sublot, Owner Name, Corp Owner,
            Designer, Total).
        permit_lic_no: licence number stored on the item.

    Fixes over the previous version: the company fallback no longer
    references the undefined name ``company``; designer fallbacks use the
    ``_getDBA`` result rather than single characters of the raw designer
    string; a "Plat ... Sublot" value is stored as the sublot (it used to
    overwrite ``lot``); a lower-case "plat" no longer crashes on a failed
    match.
    """
    def first_text(label):
        # First text node following the <em> that contains ``label``.
        raw = main_res.xpath(
            "//em[contains(text(),'%s')]/following::text()" % label
        ).extract_first()
        return rem_esc(raw) or ''

    location_address_string = first_text('Location')
    plat_lot = rem_esc(''.join(main_res.xpath('//em[contains(text(),"Plat")]/following::text()').extract()[:2]))
    Owner_name = first_text('Owner Name')
    corp_owner = first_text('Corp Owner')
    designer = first_text('Designer')
    total = first_text('Total')

    plat = ''
    lot = ''
    sublot = ''
    if plat_lot:
        has_plat = 'Plat' in plat_lot
        has_lot = 'Lot' in plat_lot
        has_sublot = 'Sublot' in plat_lot
        # The slices drop the leading keyword and, where present, the
        # trailing keyword that the greedy pattern also captured.
        if has_plat and has_lot and has_sublot:
            plat = re.search('Plat.*Lot', plat_lot).group()[4:-3].strip()
            lot = re.search('Lot.*Sublot', plat_lot).group()[3:-6].strip()
            sublot = re.search('Sublot.*', plat_lot).group()[6:].strip()
        elif has_plat and has_lot:
            plat = re.search('Plat.*Lot', plat_lot).group()[4:-3].strip()
            lot = re.search('Lot.*', plat_lot).group()[3:].strip()
        elif has_plat and has_sublot:
            plat = re.search('Plat.*Sublot', plat_lot).group()[4:-6].strip()
            sublot = re.search('Sublot.*', plat_lot).group()[6:].strip()
        elif 'plat' in plat_lot.lower():
            # Case-insensitive so "PLAT"/"plat" match like the membership test.
            plat = re.search('plat.*', plat_lot, re.I).group()[4:].strip()

    def normalise(value):
        # Upper-case; when the raw value ends with "&", strip and drop it.
        if value.endswith('&'):
            return value.upper().strip()[:-1]
        return value.upper()

    il = ItemLoader(item=RiSepticSystemLicensesSpiderItem(),response=response)
    il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    il.add_value('sourceName', 'RI_Septic_System_Licenses')
    il.add_value('url', 'https://www.ri.gov/DEM/isdssearch/')
    il.add_value('permit_lic_no', permit_lic_no)
    il.add_value('city/town', response.meta['city'])
    il.add_value('location_address_string', location_address_string.strip()+", RI")
    il.add_value('plat', normalise(plat))
    il.add_value('lot', normalise(lot))
    il.add_value('sublot', normalise(sublot))

    # Preference order for the company: corporate owner, owner, designer.
    if corp_owner.strip():
        company_name = corp_owner
    elif Owner_name.strip():
        company_name = Owner_name
    elif designer.strip():
        company_name = designer
    else:
        company_name = ''
    com_name = self._getDBA(company_name)
    designer_dba = self._getDBA(designer)

    permit_lic_desc = 'Septic System Licenses'
    if com_name[0]:
        permit_lic_desc += " For " + com_name[0]

    has_company = bool(com_name[0] and com_name[0].strip())
    il.add_value('company_name', com_name[0] if has_company else designer_dba[0])
    il.add_value('dba_name', com_name[1] if com_name[1] else designer_dba[1])
    il.add_value('person_name', designer_dba[0])
    il.add_value('total flow', '' if 'Not available' in total else total)
    il.add_value('permit_lic_desc', permit_lic_desc)
    il.add_value('permit_type', 'utility_license')
    yield il.load_item()
def parse_main_page(self, response):
    """Yield one violation item per record in the JSON lookup response.

    The payload's ``d`` field holds a bracketed list of JSON objects as
    text; it is cut into individual objects and each is decoded separately.

    Fixes over the previous version: the address is reset for every record
    (it used to be unbound for a first record without address parts and
    stale for later ones); the date-splitting regex is a raw string;
    ``_getDBA`` is called once per record.
    """
    payload = json.loads(response.body_as_unicode())
    # Keep the text between the first "[" and the following "]", then
    # separate the objects on a marker that cannot be confused with commas
    # inside them.
    records_text = payload['d'].replace('},{', '}~~{').split('[')[1].split(']')[0]
    for record_text in records_text.split('~~'):
        # Strip backslashes and re-quote one phrase whose embedded double
        # quotes would otherwise break decoding.
        json_acceptable_string = record_text.replace("\\", "").replace(
            '"administrative medicine"', "'administrative medicine'")
        d = json.loads(json_acceptable_string)

        person_name = d['FullName']
        location_address_string = ''
        if d['Address1'] and d['City'] and d['Zip']:
            location_address_string = (d['Address1'] + ', ' + d['City'] +
                                       ' ' + d['Zip'])

        vio = d['Action_Date']
        if '-' in vio:
            violation_date = ''
        else:
            # The number between the parentheses is divided by 1000 and
            # formatted as a UTC date (i.e. treated as epoch milliseconds).
            millis = int(re.split(r'\(|\)', vio)[1])
            violation_date = time.strftime('%m/%d/%Y',
                                           time.gmtime(millis / 1000.))

        name_parts = self._getDBA(person_name)

        il = ItemLoader(item=AlMedicalLicenseViolationsSpiderItem(),
                        response=response)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'AL_Medical_License_Violations')
        il.add_value(
            'url',
            'https://abme.igovsolution.com/online/Lookups/Publiclogfile.aspx'
        )
        il.add_value('person_name', name_parts[0])
        il.add_value('dba_name', name_parts[1])
        il.add_value('permit_subtype', d['LicenseType'])
        il.add_value('permit_lic_no', d['License_Number'])
        il.add_value('location_address_string', location_address_string)
        il.add_value('violation_description', d['Publicfile'])
        il.add_value('permit_lic_desc',
                     'Medical License for ' + str(person_name))
        il.add_value('violation_type', 'professional_violation')
        il.add_value('violation_date', violation_date)
        il.add_value('violation_subtype', d['ActionTaken'])
        il.add_value('permit_type', 'medical_license')
        yield il.load_item()
def save_to_csv(self, response, **meta):
    """Assemble one Champaign building-permit item from scraped fields.

    Every key read from ``meta`` below is required; a missing key raises
    ``KeyError``.  Returns the result of ``ItemLoader.load_item()``.
    """
    loader = ItemLoader(item=IlChampaignBuildingPermitsSpiderItem(),
                        response=response)
    loader.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    loader.add_value(
        'url', 'http://etrakit.ci.champaign.il.us/etrakit3/Search/permit.aspx')
    loader.add_value('sourceName', 'IL_Champaign_Building_Permits')

    # (item field, meta key) pairs copied across unchanged, in output order.
    passthrough = (
        ('finaled date', 'finaled_date'),
        ('inspection_date', 'inspection_date'),
        ('contractor_dba', 'contractor_dba'),
        ('mixed_contractor_name', 'mixed_contractor_name'),
        ('dba_name', 'dba_name'),
        ('apn', 'apn'),
        ('permit_lic_fee', 'permit_lic_fee'),
        ('location_address_string', 'location_address_string'),
        ('person_address_string', 'person_address_string'),
        ('subtype', 'subtype'),
        ('permit_subtype', 'permit_subtype'),
        ('inspection_subtype', 'inspection_subtype'),
        ('mixed_subtype', 'mixed_subtype'),
        ('contractor_address_string', 'contractor_address_string'),
        ('permit_lic_status', 'permit_lic_status'),
        ('permit_lic_exp_date', 'permit_lic_exp_date'),
        ('permit_lic_no', 'permit_lic_no'),
        ('notes', 'notes'),
        ('property type', 'property_type'),
        ('mixed_name', 'mixed_name'),
        ('inspection_pass_fail', 'inspection_pass_fail'),
        ('approved date', 'approved_date'),
        ('permit_lic_eff_date', 'permit_lic_eff_date'),
        ('permit_applied_date', 'permit_applied_date'),
        ('scheduled date', 'scheduled_date'),
    )
    for field_name, meta_key in passthrough:
        loader.add_value(field_name, meta[meta_key])

    # Description preference: the scraped description, then the permit
    # subtype, then a fixed label; the first two need more than two characters.
    if meta['permit_lic_desc'] and len(meta['permit_lic_desc']) > 2:
        description = meta['permit_lic_desc']
    elif meta['permit_subtype'] and len(meta['permit_subtype']) > 2:
        description = meta['permit_subtype']
    else:
        description = 'Building Permit'
    loader.add_value('permit_lic_desc', description)

    loader.add_value('inspection_type', meta['inspection_type'])
    loader.add_value('permit_type', 'building_permit')
    return loader.load_item()
def save_to_csv(self, response, **meta_data):
    """Populate an item loader with one Clay County building-permit record.

    Args:
        response: Response object handed to the ``ItemLoader``.
        **meta_data: Scraped values keyed by name. Every key read below must
            be present, otherwise a ``KeyError`` is raised.

    Returns:
        The populated ``ItemLoader`` itself; ``load_item()`` is not called
        here.
    """
    il = ItemLoader(item=FlClayBuildingPermitsSpiderItem(), response=response)
    il.default_input_processor = MapCompose(
        lambda v: v.strip(), remove_tags, replace_escape_chars)
    add = il.add_value

    add('ingestion_timestamp', Utils.getingestion_timestamp())
    add('permit_lic_no', str(meta_data['permit_lic_no']))

    # Fields whose item name equals the meta_data key.
    for name in (
            'permit_subtype',
            'permit_lic_desc',
            'location_address_string',
            'permit_lic_eff_date',
            'notes',
            'mixed_name',
            'mixed_subtype',
            'person_address_string',
            'mixed_contractor_name',
            'contractor_lic_no',
            'contractor_lic_type',
            'permit_lic_value',
    ):
        add(name, meta_data[name])

    # The literal string 'None' is replaced by an empty string for these two.
    for name in ('number_of_stories', 'year_built'):
        raw = meta_data[name]
        add(name, '' if raw == 'None' else raw)

    for name in (
            'inspection_id',
            'inspection_date',
            'inspection_subtype',
            'inspection_pass_fail',
            'inspector_comments',
            'inspection_type',
    ):
        add(name, meta_data[name])

    add('permit_type', "building_permit")
    add('url',
        "http://www.claycountygov.com/about-us/local-government/public-records-search/permits")
    add('sourceName', 'FL_Clay_Building_Permits')
    return il
def save_csv(self, response, data_dic):
    """Populate an item loader with one VA SOS record.

    Args:
        response: Accepted for signature compatibility; not used here.
        data_dic: Mapping of item field name to value. Each entry is added
            to the loader under its own key, in iteration order.

    Returns:
        The populated ``ItemLoader`` itself; ``load_item()`` is not called
        here.
    """
    il = ItemLoader(item=VaSosSpiderItem())

    # Constant columns are written before the per-record values.
    fixed_columns = (
        ('ingestion_timestamp', Utils.getingestion_timestamp()),
        ('sourceName', 'VA_SOS'),
        ('url', 'http://www.scc.virginia.gov/clk/dwnld.aspx'),
        ('permit_type', 'business_license'),
    )
    for field_name, constant in fixed_columns:
        il.add_value(field_name, constant)

    for field_name, value in data_dic.items():
        il.add_value(field_name, value)
    return il

    # Commented-out row-based variant retained from the original source:
    #
    # def parse_row(self, response, row):
    #     il = ItemLoader(item=VaSosSpiderItem())
    #     il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
    #     #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    #     il.add_value('sourceName', 'va_sos')
    #     il.add_value('url', 'http://www.scc.virginia.gov/clk/dwnld.aspx')
    #     il.add_value('type', row['Type'])
    #     il.add_value('entity_id', row['EntityID'])
    #     il.add_value('company_name', row['Name'])
    #     il.add_value('dba_name', row[''])
    #     il.add_value('status', row['Status'])
    #     il.add_value('statusdate', row['StatusDate'])
    #     il.add_value('duration', row['Duration'])
    #     il.add_value('creation_date', row['IncorpDate'])
    #     il.add_value('incorpstate', row['IncorpState'])
    #     il.add_value('industrycode', row['IndustryCode'])
    #     il.add_value('location_address_string', row['Street1+street2+city+state+zip'])
    #     il.add_value('prinoffeffdate', row['PrinOffEffDate'])
    #     il.add_value('mixed_name', row['RA-Name'])
    #     il.add_value('mixed_subtype', row[''])
    #     il.add_value('person_address_string', row['RA-Street1+street2+city+state+zip'])
    #     il.add_value('ra-effdate', row['RA-EffDate'])
    #     il.add_value('ra-status', row['RA-Status'])
    #     il.add_value('ra-loc', row['RA-Loc'])
    #     il.add_value('stockind', row['StockInd'])
    #     il.add_value('totalshares', row['TotalShares'])
    #     il.add_value('mergerind', row['MergerInd'])
    #     il.add_value('assessind', row['AssessInd'])
    #     il.add_value('stock', row['Stock'])
    #     il.add_value('person_name', row['Officer Name'])
    #     il.add_value('person_subtype', row['Officer Title'])
    #     il.add_value('permit_type', row[''])
    #     return il.load_item()
def save_to_csv(self, response, **det_dic):
    """Populate an item loader with one Alabama forester-license record.

    Args:
        response: Response object handed to the ``ItemLoader``.
        **det_dic: Scraped values keyed by name. Every key read below must
            be present, otherwise a ``KeyError`` is raised.

    Returns:
        The populated ``ItemLoader`` itself; ``load_item()`` is not called
        here.
    """
    il = ItemLoader(item=AlForesterLicensesSpiderItem(), response=response)
    il.default_input_processor = MapCompose(
        lambda v: v.strip(), remove_tags, replace_escape_chars)
    add = il.add_value

    add('ingestion_timestamp', Utils.getingestion_timestamp())
    add('sourceName', 'AL_Forester_Licenses')
    add('url', 'http://asbrf.alabama.gov/vs2k5/rosterofforesters.aspx')
    add('permit_type', 'forester_license')

    # il.add_value('location_address_string', "AL")
    # These two values are passed through str() before being stored.
    for field_name, source_key in (
            ('location_address_string', 'person_addrs'),
            ('county', 'person_country'),
    ):
        add(field_name, str(det_dic[source_key]))

    # Remaining (item field, det_dic key) pairs are stored unchanged.
    for field_name, source_key in (
            ('company_email', 'person_mail_id'),
            ('person_subtype', 'person_subtype'),
            ('permit_lic_no', 'person_lic_num'),
            ('person_name', 'user_name'),
            ('permit_lic_desc', 'permit_lic_desc'),
            ('dba_name', 'dba_name'),
            ('company_name', 'comny_name'),
            ('company_phone', 'person_phone_num'),
    ):
        add(field_name, det_dic[source_key])
    return il
def __init__(self, settings, file_name, delimiter, fields_to_export,
             null_header, customHeader=False, topHeader=None):
    """Record the export configuration and reset the chunk bookkeeping.

    Args:
        settings: Object exposing ``get()``; ``'JOB_DIR_PAUSE_RESUME'`` is
            read from it here and the object itself is kept.
        file_name: Stored unchanged as ``self.file_name``.
        delimiter: Stored unchanged as ``self.delimiter``.
        fields_to_export: Stored unchanged as ``self.fields_to_export``.
        null_header: Stored unchanged as ``self.null_header``.
        customHeader: Stored unchanged as ``self.customHeader``.
        topHeader: Stored unchanged as ``self.topHeader``.
    """
    # Caller-supplied configuration, kept verbatim.
    self.settings = settings
    self.file_name = file_name
    self.delimiter = delimiter
    self.fields_to_export = fields_to_export
    self.null_header = null_header
    self.customHeader = customHeader
    self.topHeader = topHeader

    # Values derived at construction time.
    self.chunk_folder = "chunk_{}".format(Utils.getingestion_timestamp())
    self.job_dir = settings.get('JOB_DIR_PAUSE_RESUME')

    # Mutable run state, starting empty.
    self.items = []
    self.chunk_number = 0
    self.appendMode = False
def save_to_csv(self, response, **meta):
    """Load one Bellingham (Whatcom County) building-permit record.

    Args:
        response: Accepted for signature compatibility; not passed to the
            loader.
        **meta: Scraped values keyed by name. Every key read below must be
            present, otherwise a ``KeyError`` is raised.

    Returns:
        The value returned by ``ItemLoader.load_item()``.
    """
    il = ItemLoader(item=WaWhatcomBellinghamBuildingPermitsSpiderItem())
    # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
    add = il.add_value

    def copy_all(pairs):
        # Store meta[key] under each item field, in the order given.
        for field_name, meta_key in pairs:
            add(field_name, meta[meta_key])

    add('ingestion_timestamp', Utils.getingestion_timestamp())
    add('sourceName', 'WA_Whatcom_Bellingham_Building_Permits')
    add('url', 'https://www.cob.org/epermits/Search/permit.aspx')

    copy_all((
        ('permit_lic_no', 'record_number'),
        ('permit_subtype', 'permit_lic_type'),
        ('subtype', 'permit_subtype'),
        ('property type', 'property_type'),
    ))

    # A falsy description is replaced by the generic label.
    add('permit_lic_desc', meta["permit_lic_desc"] or 'Building Permit')

    copy_all((
        ('Status', 'permit_lic_status'),
        ('permit_applied_date', 'permit_applied_date'),
        ('approved date', 'approved_date'),
        ('permit_lic_eff_date', 'permit_lic_eff_date'),
        ('finaled date', 'finaled_date'),
        ('permit_lic_exp_date', 'permit_lic_exp_date'),
        ('location_address_string', 'address'),
        ('apn/pin', 'apn_pin'),
        ('parcel #', 'parcel_number'),
        ('permit_lic_fee', 'permit_lic_fee'),
    ))

    # self._getDBA's result is indexed at [0] and [1]; presumably a
    # (name, dba) pair -- confirm against the helper.
    add('mixed_name', self._getDBA(meta['mixed_name'])[0])
    add('dba_name', self._getDBA(meta['mixed_name'])[1])

    copy_all((
        ('mixed_subtype', 'mixed_subtype'),
        ('person_address_string', 'person_address_string'),
    ))

    add('mixed_contractor_name', self._getDBA(meta['mixed_contractor_name'])[0])
    add('contractor_dba', self._getDBA(meta['mixed_contractor_name'])[1])

    copy_all((
        ('contractor_address_string', 'contractor_address_string'),
        ('inspection_subtype', 'inspection_subtype'),
        ('inspection_date', 'completed_date'),
        ('inspection_pass_fail', 'inspection_pass_fail'),
        ('inspection_type', 'inspection_type'),
    ))

    add('permit_type', 'building_permit')
    return il.load_item()
def __createChunkFile(self, spider):
    """Open the output file for the current chunk and start its exporter.

    The file name is ``self.file_name`` with ``_file_<chunk_number>``
    inserted before the extension when the chunk number is non-zero,
    followed by ``_<start>_<end>`` when the spider has a truthy ``start``
    attribute. Without ``self.file_name`` the name is
    ``<spider.name>_file_<chunk_number>.csv``. The file goes under
    ``STORAGE_DIR/JIRA_ID`` from the settings, with an extra
    ``resume_<timestamp>`` folder when ``self.appendMode`` is set.

    Args:
        spider: Object providing ``name`` and, optionally, ``start`` and
            ``end``.

    Side effects:
        Calls ``self.createFolder`` with the output path, sets ``self.file``
        and ``self.exporter``, and writes the ``self.topHeader`` row when
        ``self.customHeader`` is truthy.
    """
    def alnum_only(text):
        # Keep only alphanumeric characters.
        return ''.join(ch for ch in text if ch.isalnum())

    custom_name = None
    if self.file_name:
        stem, extension = os.path.splitext(self.file_name)
        pieces = [stem]
        if self.chunk_number != 0:
            pieces.append("_file_{}".format(str(self.chunk_number)))
        if hasattr(spider, 'start') and spider.start:
            pieces.append("_{}_{}".format(alnum_only(spider.start),
                                          alnum_only(spider.end)))
        pieces.append(extension)
        custom_name = "".join(pieces)

    path_parts = [self.settings.get('STORAGE_DIR'),
                  self.settings.get('JIRA_ID')]
    if self.appendMode:
        path_parts.append('resume_{}'.format(Utils.getingestion_timestamp()))
    if self.file_name:
        path_parts.append(custom_name)
    else:
        path_parts.append('{}_file_{}.csv'.format(spider.name,
                                                  str(self.chunk_number)))
    outpath = os.path.join(*path_parts)

    self.createFolder(outpath)
    self.file = open(outpath, 'w+b')

    # Optional exporter arguments are passed only when they are truthy.
    exporter_options = {'delimiter': self.delimiter}
    if self.fields_to_export:
        exporter_options['fields_to_export'] = self.fields_to_export
    if self.null_header:
        exporter_options['null_header'] = self.null_header

    self.exporter = CustomCsvItemExporter(self.file, **exporter_options)
    self.exporter.start_exporting()

    if self.customHeader:
        header_row = [self.topHeader.get(column)
                      for column in self.fields_to_export]
        self.exporter.csv_writer.writerow(header_row)
def parse_row(self, response, row):
    """Convert one CSV row of the Albany permit feed into a loaded item.

    Latitude and longitude are taken from the last line of the
    ``Location`` column, which is expected to look like ``(lat, lng)``.
    If that column is empty, or its last line is not a pair of numbers
    (for example an address line on a row without coordinates), both
    fields are stored as empty strings. The previous code raised or wrote
    the address fragments into the coordinate fields in those cases.

    Args:
        response: Response the row came from; its ``url`` is logged and
            stored in the ``url`` field.
        row: Mapping of CSV column name to value. Every column read below
            must be present, otherwise a ``KeyError`` is raised.

    Returns:
        The value returned by ``ItemLoader.load_item()``.
    """
    def coordinate_pair(location):
        # Return (lat, lng) as stripped strings, or ('', '') when the last
        # line of ``location`` is not a numeric pair.
        lines = (location or '').splitlines()
        if not lines:
            return '', ''
        parts = [part.strip()
                 for part in lines[-1].strip().strip('()').split(',')]
        if len(parts) != 2:
            return '', ''
        try:
            for part in parts:
                float(part)
        except ValueError:
            # Not coordinates; leave the fields blank rather than store
            # unrelated text.
            return '', ''
        return parts[0], parts[1]

    self.logger.info("started to extracting CSV data from {}".format(
        response.url))
    il = ItemLoader(item=NyAlbanyStateItems())
    lat, lng = coordinate_pair(row['Location'])

    il.add_value('permit_lic_no', row['Permit Number'])
    il.add_value('permit_lic_eff_date', row['Date'])
    il.add_value('application_number', row['Application Number'])
    il.add_value('location_address_string', row['Address'])
    il.add_value('person_name', row['Owner'])
    il.add_value('person_subtype', "Owner")
    il.add_value('contractor_name', row['Contractor'])
    il.add_value('permit_lic_value', row['Estimated Cost'])
    il.add_value('permit_lic_fee', row['Fee'])
    il.add_value('permit_lic_desc', row['Description of Work'])
    il.add_value('longitude', lng)
    il.add_value('latitude', lat)
    il.add_value('permit_type', "building_permits")
    il.add_value('url', response.url)
    il.add_value('sourceName', "NY_Albany_Building_Permits")
    il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    return il.load_item()
def save_to_csv(self, response, **data_pass):
    """Load one Douglas County (WI) building-permit record into an item.

    Args:
        response: Response object handed to the ``ItemLoader``.
        **data_pass: Scraped values keyed by item field name. Every key
            read below must be present, otherwise a ``KeyError`` is raised.

    Returns:
        The value returned by ``ItemLoader.load_item()``.
    """
    # An empty-string or None description falls back to the generic label.
    if data_pass['permit_lic_desc'] in ('', None):
        data_pass['permit_lic_desc'] = 'Building Permit'

    il = ItemLoader(item=WiDouglasBuildingPermitsSpiderItem(),
                    response=response)
    il.default_input_processor = MapCompose(
        lambda v: v.strip(), remove_tags, replace_escape_chars)

    il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
    il.add_value('url',
                 'https://gcs.douglascountywi.org/gcswebportal/search.aspx')
    il.add_value('sourceName', 'WI_Douglas_Building_Permits')

    # Each item field is filled from the data_pass entry of the same name.
    field_names = (
        'inspector_comments',
        'mixed_name',
        'permit_subtype',
        'permit_lic_desc',
        'mixed_subtype',
        'permit_type',
        'permit_lic_fee',
        'inspection_pass_fail',
        'permit_lic_status',
        'location_address_string',
        'dba_name',
        'inspection_subtype',
        'permit_lic_eff_date',
        'permit_lic_no',
        'prop type',
        'inspection_date',
        'inspection_type',
        'person',
        'municipality',
        'issue #',
        'parcel number',
        'mail_address_string',
    )
    for name in field_names:
        il.add_value(name, data_pass[name])
    return il.load_item()