class IlAsbestosWorkerLicensesSpider(ExcelFeedSpider, DataFormatterMixin, LookupDatareaderMixin):
    """Spider for Illinois asbestos-worker licenses on data.illinois.gov.

    Downloads the published .xls feed and emits one item per spreadsheet row
    via ExcelFeedSpider's ``parse_excel``/``parse_row`` hooks.
    """

    name = '1392_il_asbestos_worker_licenses'
    allowed_domains = ['illinois.gov']
    start_urls = ['https://data.illinois.gov/dataset/378idph_asbestos_licensed_workers/resource/f3266216-1c0e-4326-acb7-0f4341d1b463']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1392_Licenses_Asbestos_Worker_IL_CurationReady'),
        'JIRA_ID': 'AI_1392',
        'COOKIES_ENABLED': True,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR': CustomSettings.getJobDirectory('il_asbestos_worker_licenses'),
        # Output column -> source spreadsheet header ('' = derived/constant field).
        'TOP_HEADER': {
            'county': 'County',
            'dba_name': '',
            'permit_lic_desc': '',
            'person_phone': 'Expr2',
            'permit_lic_no': 'lic_id_number',
            'permit_type': '',
            'person_address_string': 'Address',
            'person_name': 'tech_name + Last_name',
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'person_name', 'dba_name', 'person_address_string',
            'county', 'person_phone', 'permit_lic_desc', 'permit_type',
            'sourceName', 'url', 'ingestion_timestamp',
        ],
        'NULL_HEADERS': ['county'],
    }

    def parse(self, response):
        """Request the source .xls export and hand it to ``parse_excel``."""
        urls = "https://data.illinois.gov/dataset/1ba86906-adb7-40db-a893-5ca97f09942e/resource/f3266216-1c0e-4326-acb7-0f4341d1b463/download/sgroupseheh-asbesinternet-listingworker-internet-listing.xls"
        yield scrapy.Request(urls, callback=self.parse_excel, dont_filter=True, encoding='utf-8')

    def parse_row(self, response, row):
        """Map one spreadsheet row to an ``IlAsbestosWorkerLicensesSpiderItem``."""
        il = ItemLoader(item=IlAsbestosWorkerLicensesSpiderItem())
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'IL_Asbestos_Worker_Licenses')
        il.add_value('url', 'https://data.illinois.gov/dataset/378idph_asbestos_licensed_workers/resource/f3266216-1c0e-4326-acb7-0f4341d1b463')
        il.add_value('person_address_string', self.format__address_4(row['Expr1'], row['tech_city'], row['tech_state'], row['tech_zip']))
        il.add_value('person_name', row['tech_name'] + ' ' + row['LAST_NAME'])
        il.add_value('permit_lic_desc', 'Asbestos Contractor License')
        il.add_value('dba_name', '')
        il.add_value('person_phone', row['Expr2'])
        il.add_value('county', row['COUNTY'])
        # FIX: the original prepended a single '0' whenever the id was shorter
        # than 9 characters, which still left ids of length < 8 under-padded.
        # zfill pads to the full fixed width of 9; output is identical for the
        # 8-character case the old code handled.
        il.add_value('permit_lic_no', row['lic_id_number'].zfill(9))
        il.add_value('permit_type', 'asbestos_contractor_license')
        yield il.load_item()
class IlHospitalLicensesSpider(ExcelFeedSpider, DataFormatterMixin, LookupDatareaderMixin):
    """Spider for the Illinois hospital directory .xls feed on data.illinois.gov."""

    name = '1390_il_hospital_licenses'
    allowed_domains = ['illinois.gov']
    start_urls = ['https://data.illinois.gov/dataset/410idph_hospital_directory/resource/9bdedb85-77f3-490a-9bbd-2f3f5f227981']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1390_Licenses_Hospital_IL_CurationReady'),
        'JIRA_ID': 'AI_1390',
        'DOWNLOAD_DELAY': 5,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR': CustomSettings.getJobDirectory('il_hospital_licenses'),
        # Output column -> source spreadsheet header ('' = derived/constant field).
        'TOP_HEADER': {
            'company_name': 'Hospitals/End Stage Renal Disease/Pregnancy Termination Specialty Centers',
            'company_phone': 'Phone',
            'company_subtype': 'Type',
            'county': 'County',
            'dba_name': '',
            'location_address_string': 'Address',
            'permit_lic_desc': '',
            'permit_lic_exp': 'Exp. Date',
            'permit_lic_no': 'License #/Medicare #',
            'permit_type': '',
        },
        'FIELDS_TO_EXPORT': [
            'company_name', 'dba_name', 'location_address_string', 'county',
            'company_phone', 'permit_lic_no', 'company_subtype', 'permit_lic_exp',
            'permit_lic_desc', 'permit_type', 'sourceName', 'url', 'ingestion_timestamp',
        ],
        'NULL_HEADERS': ['county'],
    }

    def parse(self, response):
        """Request the source .xls export and hand it to ``parse_excel``."""
        yield scrapy.Request(
            'https://data.illinois.gov/dataset/a552f663-74a8-4722-a506-0619e9356062/resource/9bdedb85-77f3-490a-9bbd-2f3f5f227981/download/siqueryinterns-2018-2019illinois.govhospitals-march-2019.xls',
            callback=self.parse_excel, dont_filter=True, encoding='utf-8')

    def parse_row(self, response, row):
        """Map one spreadsheet row to an ``IlHospitalLicensesSpiderItem``."""
        # FIX: replaced a leftover debug print(row) with spider-level debug
        # logging so normal runs are not spammed on stdout.
        self.logger.debug('parse_row: %s', row)
        il = ItemLoader(item=IlHospitalLicensesSpiderItem())
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('url', 'https://data.illinois.gov/dataset/410idph_hospital_directory/resource/9bdedb85-77f3-490a-9bbd-2f3f5f227981')
        il.add_value('sourceName', 'IL_Hospital_Licenses')
        il.add_value('permit_type', "medical_license")
        # _getDBA splits the facility string into (name, dba) — see mixin.
        name = self._getDBA(row['Hospitals'])
        # Drop a trailing " -" marker from the facility name when present.
        company_name = str(name[0]).replace(' -', '') if ' -' in str(name[0]) else name[0]
        address = self.format__address_4(row['Address'], row['City'], 'IL', row['Zipcode'])
        il.add_value('dba_name', name[1])
        il.add_value('permit_lic_no', row.get('License #', ''))
        il.add_value('permit_lic_exp', self.format_date(row.get('Exp. Date', '')) if row.get('Exp. Date') else '')
        il.add_value('company_name', company_name)
        il.add_value('location_address_string', address)
        il.add_value('county', row.get('County', ''))
        il.add_value('permit_lic_desc', "Medical License for " + company_name if name[0] else "Medical License")
        il.add_value('company_phone', row.get('Phone', ''))
        il.add_value('company_subtype', row.get('Type', ''))
        yield il.load_item()
class AlPodiatryLicensesSpider(ExcelFeedSpider, DataFormatterMixin, LookupDatareaderMixin):
    """Spider for Alabama podiatry licenses.

    Finds the spreadsheet link on the board's licensee page, downloads it and
    emits one item per row via ``parse_excel``/``parse_row``.
    """

    name = '1474_al_podiatry_licenses'
    allowed_domains = ['alabama.gov']
    start_urls = ['http://www.podiatryboard.alabama.gov/licensees.aspx']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1474_Licenses_Podiatry_AL_CurationReady'),
        'JIRA_ID': 'AI_1474',
        # 'JOBDIR': CustomSettings.getJobDirectory('AlPodiatryLicensesSpider'),
        # Output column -> source spreadsheet header ('' = derived/constant field).
        'TOP_HEADER': {
            'company_name': 'Practice Name',
            'company_phone': 'Office Phone #',
            'controlled substance license #': 'Controlled Substance License #',
            'dba_name': '',
            'location_address_string': 'Address',
            'permit_lic_desc': '',
            'permit_lic_eff_date': 'Effective Date',
            'permit_lic_exp_date': 'Expiration Date',
            'permit_lic_no': 'License #',
            'permit_type': '',
            'person_name': 'Name',
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'controlled substance license #', 'person_name',
            'permit_lic_exp_date', 'permit_lic_eff_date', 'company_name',
            'dba_name', 'location_address_string', 'company_phone',
            'permit_lic_desc', 'permit_type', 'url', 'sourceName',
            'ingestion_timestamp',
        ],
        'NULL_HEADERS': ['controlled substance license #'],
    }

    def parse(self, response):
        """Locate the spreadsheet link on the landing page and fetch it."""
        extension = response.xpath('//*[@id="form1"]/div[4]/div[1]/div[3]/div[1]/h3/a/@href').extract_first()
        if extension:
            next_page_url = 'http://www.podiatryboard.alabama.gov/' + extension
            yield scrapy.Request(url=next_page_url, callback=self.parse_excel, dont_filter=True)

    def parse_row(self, response, row):
        """Map one spreadsheet row to an ``AlPodiatryLicensesSpiderItem``."""
        il = ItemLoader(item=AlPodiatryLicensesSpiderItem())
        # NOTE(review): ingestion_timestamp is listed in FIELDS_TO_EXPORT but
        # was left commented out here (unlike the sibling IL spiders); kept
        # disabled to preserve the current output — confirm intent.
        # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'AL_Podiatry_Licenses')
        il.add_value('url', 'http://www.podiatryboard.alabama.gov/licensees.aspx')
        il.add_value('permit_type', 'podiatry_license')
        il.add_value('permit_lic_exp_date', self.format_date(row['Expiration Date']))
        il.add_value('permit_lic_no', row['License #'])
        # FIX: _getDBA was invoked twice on the same input; parse once and
        # reuse the (name, dba) pair.
        name_parts = self._getDBA(row['Practice Name'])
        il.add_value('dba_name', name_parts[1])
        # Fall back to the licensee's personal name when no usable practice
        # name was extracted.
        person_name = row['First Name'] + ' ' + row['Last Name']
        company_name = name_parts[0] if len(name_parts[0]) > 1 else person_name
        il.add_value('permit_lic_desc', 'Podiatry License for ' + company_name if len(company_name) > 1 else 'Podiatry License')
        location_address = self.format__address_4(row['Address'], row['City'], row['State'], row['Zip Code'])
        # Default to the bare state when the formatted address is effectively empty.
        il.add_value('location_address_string', location_address if len(location_address) > 2 else 'AL')
        il.add_value('person_name', person_name)
        il.add_value('permit_lic_eff_date', self.format_date(row['Effective Date']))
        il.add_value('controlled substance license #', row['Controlled Substance License #'])
        il.add_value('company_name', company_name)
        il.add_value('company_phone', row['Office Phone #'])
        return il.load_item()
class NvFuneralLicensesSpider(CommonSpider):
    """Spider for Nevada funeral-board licensees.

    NOTE(review): ``parse`` does not fetch anything from ``start_urls`` — it
    reads hard-coded *local* PDF paths with tabula (``/home/ait-python/...``),
    so it only works on the original author's machine. Confirm before running.
    """

    name = 'nv_funeral_licenses'
    allowed_domains = ['nv.gov']
    start_urls = ['http://funeral.nv.gov/Licensees/Licensees/']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('Licenses_Funeral_NV_CurationReady'),
        'JIRA_ID': 'AI_597',
        'JOBDIR': CustomSettings.getJobDirectory('NvFuneralLicensesSpider'),
        # Output column -> source column description ('' = derived/constant field).
        'TOP_HEADER': {
            'company_name': 'Facility DBA Name',
            'company_phone': 'Phone',
            'location_address_string': 'Physical Address+City++State+Zip',
            'permit_lic_eff_date': 'Date Beginning',
            'permit_lic_exp_date': 'Date Ending',
            'permit_lic_no': 'License#/ Permit#',
            'permit_subtype': 'License Type',
            'permit_type': '',
            'person_name': 'First Name+Middle Name+Last Name'},
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'person_name', 'company_name', 'permit_subtype',
            'location_address_string', 'permit_lic_eff_date',
            'permit_lic_exp_date', 'company_phone', 'permit_type',
            'sourceName', 'url', 'ingestion_timestamp'],
        'NULL_HEADERS': []
    }
    # NOTE(review): class-level list attributes below are never read or
    # appended to by the visible code — they look like leftover scaffolding.
    permit_lic_no = []
    permit_lic_eff_date = []
    permit_subtype = []
    person_name = []
    ingestion_timestamp = []
    location_address_string = []
    sourceName = []
    company_phone = []
    permit_type = []
    permit_lic_exp_date = []
    url = []
    company_name = []
    person_name1 = []
    person_name2 = []
    dob = []
    doe = []

    def parse(self, response):
        """Extract licensee rows from two local PDFs and yield one item per row.

        Both tables are read positionally (by column index) after tabula
        converts the PDF page area to a DataFrame.
        """
        # Funeral-director table, page 1 of a local PDF.
        # NOTE(review): 'delimeter' is misspelled and is not a known
        # tabula.read_pdf parameter — presumably ignored/passed through;
        # verify against the installed tabula-py version.
        df = tabula.read_pdf('/home/ait-python/Desktop/583/1.pdf',
                             pages='1', delimeter=',',
                             encoding='ISO-8859-1',
                             area=(146.498, 30.0, 749.318, 579.27),
                             guess=False,
                             pandas_options={'header': 'infer'})
        for _, row in df.iterrows():
            self.a = row.tolist()
            lic_no = str(self.a[0])
            # Debug print left in place (doc-only review).
            print("###################", lic_no)
            # Columns 2-4 look like first/middle/last name parts; pandas puts
            # the string "nan" in missing cells once stringified.
            fname = str(self.a[2]) + " " + str(self.a[3]) + " " + str(self.a[4])
            lname = fname.replace("nan", "")
            lic_type = "Funeral Director License"
            daob = str(self.a[6])
            daoe = str(self.a[7])
            il = ItemLoader(item=NvFuneralLicensesSpiderItem(), response=response)
            il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
            il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
            il.add_value('sourceName', 'NV_Funeral_Licenses')
            il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
            il.add_value('permit_lic_no', lic_no)
            il.add_value('permit_lic_eff_date', daob)
            il.add_value('permit_subtype', lic_type)
            # NOTE(review): uses the raw fname (may still contain "nan");
            # the embalmer loop below uses the cleaned lname — confirm which
            # is intended.
            il.add_value('person_name', fname)
            il.add_value('location_address_string', 'NV')
            il.add_value('company_phone', '')
            il.add_value('permit_type', 'cemetery_funeral_license')
            il.add_value('permit_lic_exp_date', daoe)
            il.add_value('company_name', '')
            yield il.load_item()
        # Embalmer table, page 2 of a second local PDF.
        df2 = tabula.read_pdf('/home/ait-python/Downloads/pdf/Embalmers.pdf',
                              pages='2', delimeter=',',
                              encoding='ISO-8859-1',
                              area=(70.763, 30.0, 535.883, 580.035),
                              guess=False,
                              pandas_options={'header': 'infer'})
        for _, row in df2.iterrows():
            self.b = row.tolist()
            # License number may be split across the first two columns.
            lic_no = str(self.b[0]) + str(self.b[1]).replace('nan', '')
            fname = str(self.b[2]) + " " + str(self.b[3]) + " " + str(self.b[4]).replace('nan', '')
            lname = fname.replace('nan', '')
            lic_type = "Embalmer License"
            daob = str(self.b[6])
            daoe = str(self.b[7])
            il = ItemLoader(item=NvFuneralLicensesSpiderItem(), response=response)
            il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
            il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
            il.add_value('sourceName', 'NV_Funeral_Licenses')
            il.add_value('url', 'http://funeral.nv.gov/Licensees/Licensees/')
            il.add_value('permit_lic_no', lic_no)
            il.add_value('permit_lic_eff_date', daob)
            il.add_value('permit_subtype', lic_type)
            il.add_value('person_name', lname)
            il.add_value('location_address_string', 'NV')
            il.add_value('company_phone', '')
            il.add_value('permit_type', 'cemetery_funeral_license')
            il.add_value('permit_lic_exp_date', daoe)
            il.add_value('company_name', '')
            yield il.load_item()
        # NOTE(review): a large body of fully commented-out code followed here:
        # near-identical tabula extraction loops for
        # FuneralArrangersLicensees.pdf (pages 1-3),
        # Funeral-Directors.pdf (pages 1-4) and
        # FuneralEstablishmentsAndDirectCremationFacilities.pdf (pages 1-3),
        # plus a stubbed-out ItemLoader template and an items_count update.
        # Condensed to this note as dead code; recover from version control
        # if those sources need to be re-enabled.
class MsElectronicProtectionLicensesSpider(CommonSpider):
    """Spider for Mississippi electronic-protection-system licenses.

    Drives an ASP.NET WebForms search page: ``parse`` submits one POST per
    license category, ``detail_page`` scrapes the result grid and re-posts
    for pagination, and ``save_to_csv`` builds the output item.
    """

    name = '1498_ms_electronic_protection_licenses'
    allowed_domains = ['ms.gov']
    start_urls = [
        'https://www.mid.ms.gov/sfm/mississippi-electronic-protection-systems.aspx#Licensing%2520Search'
    ]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1498_Licenses_Electronic_Protection_MS_CurationReady'),
        'JIRA_ID': 'AI_1498',
        'HTTPCACHE_ENABLED': False,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        # 'JOBDIR' : CustomSettings.getJobDirectory('ms_electronic_protection_licenses'),
        # Output column -> source grid header ('' = derived/constant field).
        # NOTE(review): ' nat. prod. id' carries a leading space here and in
        # FIELDS_TO_EXPORT, but not in NULL_HEADERS — confirm which is right.
        'TOP_HEADER': {
            ' nat. prod. id': 'Nat. Prod. ID',
            'company_name': 'Name',
            'company_phone': 'Phone',
            'dba_name': '',
            'location_address_string': 'Physical Address',
            'permit_lic_desc': '',
            'permit_lic_exp': 'Exp. Date',
            'permit_lic_no': 'License Number',
            'permit_subtype': 'Type',
            'permit_type': '',
            'person_name': 'First+Middle+Last name'
        },
        'FIELDS_TO_EXPORT': [
            'permit_subtype', 'permit_lic_no', 'company_name', 'dba_name',
            'person_name', ' nat. prod. id', 'location_address_string',
            'company_phone', 'permit_lic_exp', 'permit_lic_desc',
            'permit_type', 'url', 'ingestion_timestamp', 'sourceName'
        ],
        'NULL_HEADERS': ['nat. prod. id']
    }

    def parse(self, response):
        """Submit one search POST per license-category radio-button option.

        ``table_data`` holds the human-readable labels scraped from the radio
        list; ``tb_list`` holds the corresponding form values, zipped pairwise.
        """
        table_data = response.xpath(
            '//*[@id="maincontent_pagecontent_rbls"]//tr/td//text()').extract(
            )
        # NOTE(review): 'REPT' is posted but detail_page has no branch for it,
        # so that category yields no items — confirm whether that is intended.
        tb_list = [
            'Class A - Contracting Company', 'REPT', 'REPH', 'REPC', 'REPD', 'REPB'
        ]
        for i, j in zip(table_data, tb_list):
            # Hidden-field values are lifted from the landing page so the
            # ASP.NET AJAX (UpdatePanel) postback validates server-side.
            form_data = {
                'ctl00$ctl00$maincontent$pagecontent$ToolkitScriptManager1':
                'ctl00$ctl00$maincontent$pagecontent$UpdatePanel1|ctl00$ctl00$maincontent$pagecontent$ButtonSubmit',
                '_TSM_HiddenField_': response.xpath(
                    "//input[@id='_TSM_HiddenField_']/@value").extract_first(),
                'ctl00$ctl00$maincontent$pagecontent$rbls': str(j),
                '__EVENTTARGET': '',
                '__EVENTARGUMENT': '',
                '__LASTFOCUS': '',
                '__VIEWSTATE': response.xpath(
                    "//input[@id='__VIEWSTATE']/@value").extract_first(),
                '__VIEWSTATEGENERATOR':
                response.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value").
                extract_first(),
                '__EVENTVALIDATION': response.xpath(
                    "//input[@id='__EVENTVALIDATION']/@value").extract_first(),
                '__ASYNCPOST': 'true',
                'ctl00$ctl00$maincontent$pagecontent$ButtonSubmit': 'Submit',
            }
            # print("===================================",str(j),str(i))
            yield scrapy.FormRequest(response.url,
                                     callback=self.detail_page,
                                     dont_filter=True,
                                     method='POST',
                                     formdata=form_data,
                                     meta={
                                         'page': 1,
                                         'permit_lic_desc': str(i),
                                         'option': str(j)
                                     })

    def detail_page(self, response):
        """Scrape one page of search results and re-post for the next page.

        The AJAX response is a pipe-delimited partial-postback payload, not a
        full HTML document, so hidden-field values are recovered with string
        splits and the body is re-wrapped in an HtmlResponse for XPath.
        """
        meta = {}
        meta = response.meta
        res = ''
        # Reset every output field before reading this page's rows.
        # NOTE(review): this also clears the 'permit_lic_desc' label set in
        # parse(); the branches below overwrite it with fixed strings, so the
        # scraped label is effectively unused.
        meta['permit_lic_desc'] = meta['permit_lic_no'] = meta[
            'company_name'] = meta['person_name'] = meta['nat_prof_id'] = meta[
                'location_address_string'] = meta['phone'] = meta[
                    'exp_date'] = meta['permit_lic_desc'] = meta[
                        'dba_name'] = ''
        if isinstance(response, HtmlResponse):
            res = response
        else:
            # NOTE(review): the URL below looks copy-pasted from an Arkansas
            # spider; it only serves as a dummy base URL for re-parsing the
            # partial-postback body.
            res = HtmlResponse(
                'https://attorneyinfo.aoc.arkansas.gov/info/attorney_search/info/attorney/attorneysearch.aspx?',
                body=str.encode(response.text))
        viewstate = response.text.split('__VIEWSTATE|')[1].split('|')[0]
        __VIEWSTATEGENERATOR = response.text.split(
            '__VIEWSTATEGENERATOR|')[1].split('|')[0]
        # NOTE(review): hard-coded script-manager token — may go stale if the
        # site redeploys; confirm it still matches.
        tsm = '2GFwlGU9ATlFIxrdsXRzcja58_1t5F8HSleaZM4ZQwk1'
        __EVENTVALIDATION = response.text.split(
            '__EVENTVALIDATION|')[1].split('|')[0]
        # print("============================================================",meta['option'])
        if meta['option'] == 'Class A - Contracting Company':
            # Company grid: one row per contracting company.
            table_data = res.xpath(
                '//*[@id="maincontent_pagecontent_DataGrid4"]//tr')
            # Skip the header row and the trailing pager row.
            for i in table_data[1:-1]:
                meta['permit_lic_no'] = i.xpath('td[1]/text()').extract_first()
                company_name = i.xpath('td[2]/text()').extract_first()
                meta['company_name'] = self._getDBA(company_name)[0]
                meta['dba_name'] = self._getDBA(company_name)[1]
                add1 = i.xpath('td[3]/text()').extract_first()
                add2 = i.xpath('td[4]/text()').extract_first()
                add3 = i.xpath('td[5]/text()').extract_first()
                city = i.xpath('td[6]/text()').extract_first()
                state = i.xpath('td[7]/text()').extract_first()
                code = i.xpath('td[8]/text()').extract_first()
                print("------------------------------", meta['company_name'])
                meta['location_address_string'] = self.format__address6(
                    add1, add2, add3, city, state, code).replace(' ,', '')
                meta['phone'] = i.xpath('td[9]/text()').extract_first()
                meta['permit_lic_desc'] = 'Company'
                yield self.save_to_csv(response, **meta).load_item()
            # Pager: find the link following the current page's <span>.
            next_page = res.xpath(
                '//*[@id="maincontent_pagecontent_DataGrid4"]//tr/td/span[contains(text(), "'
                + str(meta['page']) +
                '")]/following::a/@href').extract_first()
            if next_page:
                datas = JavaScriptUtils.getValuesFromdoPost(next_page)
                form_data = {
                    'ctl00$ctl00$maincontent$pagecontent$ToolkitScriptManager1':
                    'ctl00$ctl00$maincontent$pagecontent$UpdatePanel1|' +
                    datas['__EVENTTARGET'],
                    '_TSM_HiddenField_': tsm,
                    'ctl00$ctl00$maincontent$pagecontent$rbls':
                    'Class A - Contracting Company',
                    '__EVENTTARGET': datas['__EVENTTARGET'],
                    '__EVENTARGUMENT': datas['__EVENTARGUMENT'],
                    '__LASTFOCUS': '',
                    '__VIEWSTATE': viewstate,
                    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
                    '__EVENTVALIDATION': __EVENTVALIDATION,
                    '__ASYNCPOST': 'true'
                }
                yield scrapy.FormRequest(response.url,
                                         callback=self.detail_page,
                                         dont_filter=True,
                                         method='POST',
                                         formdata=form_data,
                                         meta={
                                             'page': meta['page'] + 1,
                                             'option': meta['option']
                                         })
        if meta['option'] == 'REPH' or meta['option'] == 'REPC' or meta[
                'option'] == 'REPD' or meta['option'] == 'REPB':
            # Individual-licensee grid (helpers/installers/salespersons/technicians).
            table_data = res.xpath(
                '//*[@id="maincontent_pagecontent_DataGrid1"]//tr')
            for i in table_data[1:-1]:
                meta['permit_lic_no'] = i.xpath(
                    'td[1]/a/text()').extract_first()
                meta['nat_prof_id'] = i.xpath('td[2]/text()').extract_first()
                f_name = i.xpath('td[3]/text()').extract_first()
                l_name = i.xpath('td[4]/text()').extract_first()
                m_name = i.xpath('td[5]/text()').extract_first()
                s_name = i.xpath('td[6]/text()').extract_first()
                meta[
                    'person_name'] = f_name + ' ' + m_name + ' ' + l_name + ' ' + s_name
                add1 = i.xpath('td[7]/text()').extract_first()
                add2 = i.xpath('td[8]/text()').extract_first()
                city = i.xpath('td[9]/text()').extract_first()
                state = i.xpath('td[10]/text()').extract_first()
                code = i.xpath('td[11]/text()').extract_first()
                meta['location_address_string'] = self.format__address5(
                    add1, add2, city, state, code)
                meta['phone'] = i.xpath('td[12]/text()').extract_first()
                meta['exp_date'] = i.xpath('td[13]/text()').extract_first()
                meta['company_name'] = meta['person_name']
                # Map the posted option code to its human-readable label.
                if meta['option'] == 'REPH':
                    meta['permit_lic_desc'] = 'Helper'
                if meta['option'] == 'REPC':
                    meta['permit_lic_desc'] = 'Installer'
                if meta['option'] == 'REPD':
                    meta['permit_lic_desc'] = 'Salesperson'
                if meta['option'] == 'REPB':
                    meta['permit_lic_desc'] = 'Technician'
                yield self.save_to_csv(response, **meta).load_item()
            next_page = res.xpath(
                '//*[@id="maincontent_pagecontent_DataGrid1"]//tr/td/span[contains(text(), "'
                + str(meta['page']) +
                '")]/following::a/@href').extract_first()
            if next_page:
                datas = JavaScriptUtils.getValuesFromdoPost(next_page)
                form_data = {
                    'ctl00$ctl00$maincontent$pagecontent$ToolkitScriptManager1':
                    'ctl00$ctl00$maincontent$pagecontent$UpdatePanel1|' +
                    datas['__EVENTTARGET'],
                    '_TSM_HiddenField_': tsm,
                    'ctl00$ctl00$maincontent$pagecontent$rbls': meta['option'],
                    '__EVENTTARGET': datas['__EVENTTARGET'],
                    '__EVENTARGUMENT': datas['__EVENTARGUMENT'],
                    '__LASTFOCUS': '',
                    '__VIEWSTATE': viewstate,
                    '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
                    '__EVENTVALIDATION': __EVENTVALIDATION,
                    '__ASYNCPOST': 'true'
                }
                yield scrapy.FormRequest(response.url,
                                         callback=self.detail_page,
                                         dont_filter=True,
                                         method='POST',
                                         formdata=form_data,
                                         meta={
                                             'page': meta['page'] + 1,
                                             'option': meta['option']
                                         })

    def save_to_csv(self, response, **meta):
        """Build and return an ItemLoader populated from ``meta``.

        Returns the loader itself (callers invoke ``.load_item()``).
        """
        # self.state['items_count'] = self.state.get('items_count', 0) + 1
        il = ItemLoader(item=MsElectronicProtectionLicensesSpiderItem(),
                        response=response)
        # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
        # il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value(
            'url',
            'https://www.mid.ms.gov/sfm/mississippi-electronic-protection-systems.aspx#Licensing%2520Search'
        )
        il.add_value('sourceName', 'MS_Electronic_Protection_Licenses')
        il.add_value('permit_subtype', meta['permit_lic_desc'])
        il.add_value('permit_lic_no', meta['permit_lic_no'])
        il.add_value('company_name', meta['company_name'])
        il.add_value('dba_name', meta['dba_name'])
        il.add_value('person_name', meta['person_name'])
        il.add_value(' nat. prod. id', meta['nat_prof_id'])
        il.add_value('location_address_string',
                     meta['location_address_string'])
        il.add_value('company_phone', meta['phone'])
        il.add_value('permit_lic_exp', meta['exp_date'])
        il.add_value('permit_lic_desc', meta['permit_lic_desc'])
        il.add_value('permit_type', 'electronics_license')
        return il
class WiDouglasBuildingPermitsSpider(CommonSpider):
    """Scrape building permits (and their inspections) from the Douglas
    County, WI GCS web portal.

    The portal is an ASP.NET WebForms site, so every navigation step is a
    POST carrying ``__VIEWSTATE`` / ``__EVENTVALIDATION`` tokens scraped
    from the previous response.

    Flow:
        parse()         -> postback that activates the "Permit" search tab
        parse_things()  -> pops one search term and submits the search form
        parse_second()  -> walks one result page; drills into permit,
                           property and inspection detail panels with
                           synchronous inline requests; paginates; emits one
                           item per permit/inspection via save_to_csv()
    """

    name = '257_wi_douglas_building_permits'
    allowed_domains = ['douglascountywi.org']
    start_urls = ['https://gcs.douglascountywi.org/gcswebportal/search.aspx']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-257_Permits_Buildings_WI_Dougles_CurationReady'),
        'JIRA_ID': 'AI_257',
        'CONCURRENT_REQUESTS': 1,
        # 'DOWNLOAD_DELAY':1,
        # 'JOBDIR' : CustomSettings.getJobDirectory('WiTrempealeauBuildingPermitsSpider'),
        'HTTPCACHE_ENABLED': False,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'TOP_HEADER': {
            'dba_name': '',
            'inspection_date': 'Date',
            'inspection_pass_fail': 'Status.1',
            'inspection_subtype': 'Name',
            'inspection_type': '',
            'inspector_comments': 'Comments',
            'issue #': 'Issue #',
            'location_address_string': 'Property Address+ WI',
            'mail_address_string': 'Billing Address',
            'mixed_name': 'Owner ',
            'mixed_subtype': '',
            'municipality': 'Municipality',
            'parcel number': 'Parcel Number',
            'permit_lic_desc': '',
            'permit_lic_eff_date': 'Issue Date',
            'permit_lic_fee': 'Fee Total',
            'permit_lic_no': 'Application #',
            'permit_lic_status': 'Status',
            'permit_subtype': 'Type',
            'permit_type': '',
            'person': 'Person',
            'prop type': 'Prop Type'
        },
        'FIELDS_TO_EXPORT': [
            'prop type', 'parcel number', 'municipality',
            'location_address_string', 'mixed_name', 'dba_name',
            'mixed_subtype', 'mail_address_string', 'permit_subtype',
            'permit_lic_no', 'issue #', 'permit_lic_eff_date',
            'permit_lic_status', 'permit_lic_fee', 'inspection_type',
            'inspection_date', 'inspection_subtype', 'inspection_pass_fail',
            'person', 'inspector_comments', 'permit_lic_desc', 'permit_type',
            'sourceName', 'url', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': ['person', 'prop type', 'issue #', 'municipality',
                         'parcel number']
    }
    # One-shot flag: build self.search_element on the first parse_things call.
    value = True
    alpha1 = []
    alpha = []

    def __init__(self, start=None, end=None, startnum=None, endnum=None,
                 proxyserver=None, *a, **kw):
        # FIX: forward the proxyserver argument instead of hard-coding None,
        # which silently dropped any proxy the caller configured.
        super(WiDouglasBuildingPermitsSpider, self).__init__(
            start, end, proxyserver=proxyserver, *a, **kw)
        import os
        current_file_path = os.path.abspath(os.path.dirname(
            __file__)) + '/AI_257_permit_no_list_{}_{}.csv'.format(
                self.start, self.end)
        # Side file collecting every parcel number seen during the crawl.
        # NOTE(review): never closed explicitly; relies on interpreter exit.
        self.csv = open(current_file_path, "w")
        columnTitleRow = "parcel_no\n"
        # FIX: the header row was built but never written to the file.
        self.csv.write(columnTitleRow)

    def parse(self, response):
        """Fire the WebForms postback that activates the Permit search tab."""
        form_data = {
            '__EVENTTARGET': 'ctl00$cphMainApp$LinkButtonPermitTab',
            '__VIEWSTATE': response.xpath(
                "//input[@id='__VIEWSTATE']/@value").extract_first(),
            '__EVENTVALIDATION': response.xpath(
                "//input[@id='__EVENTVALIDATION']/@value").extract_first(),
            '__VIEWSTATEGENERATOR': 'DCE30F85'
        }
        yield scrapy.FormRequest(
            url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
            callback=self.parse_things,
            dont_filter=True,
            formdata=form_data)

    def parse_things(self, response):
        """Pop the next search term and submit the permit search form."""
        if self.value:
            # Build the worklist of search strings once, lazily.
            self.search_element = SearchCriteria.strRange(self.start, self.end)
            self.value = False
        if len(self.search_element) > 0:
            val = self.search_element.pop(0)
            form_data1 = {
                'ctl00$cphMainApp$ToolkitScriptManager1':
                'ctl00$cphMainApp$upSearch|ctl00$cphMainApp$ButtonPermitSearch',
                '__VIEWSTATEGENERATOR': 'DCE30F85',
                '__VIEWSTATE': response.xpath(
                    "//input[@id='__VIEWSTATE']/@value").extract_first(),
                '__EVENTVALIDATION': response.xpath(
                    "//input[@id='__EVENTVALIDATION']/@value").extract_first(),
                'ctl00$cphMainApp$PermitSearchCriteria1$TextBoxRefLastName':
                str(val),
                '__ASYNCPOST': 'true',
                'ctl00$cphMainApp$ButtonPermitSearch': 'Search For Permits'
            }
            yield scrapy.FormRequest(
                url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
                callback=self.parse_second,
                dont_filter=True,
                formdata=form_data1)

    @inline_requests
    def parse_second(self, response):
        """Process one page of search results.

        For each result row: if the row has no parcel number, open the permit
        details directly and emit bare permit rows; otherwise drill
        permit -> property -> each application -> its inspection activities,
        emitting one item per inspection (or one per application when it has
        no activities). Finally follow grid pagination and, when this search
        term is exhausted, kick off the next one.
        """
        if 'Your search returned no results.' in response.text:
            pass
        else:
            vals = response.xpath(
                '//*[@id="ctl00_cphMainApp_GridViewPermitResults"]//tr/td[2]/a/@href'
            ).extract()
            # Data rows of the results grid start at tr[3].
            for v in range(3, len(vals) + 2):
                parcelss_noo = str(response.xpath(
                    '//*[@id="ctl00_cphMainApp_GridViewPermitResults"]//tr['
                    + str(v) + ']/td[5]/a/text()').extract_first()
                ).strip().replace('None', '')
                # str(...) can never be None, so an empty string is the only
                # "missing parcel" signal after the replace above.
                if parcelss_noo == '':
                    # --- Row without a parcel number: open permit details
                    # directly via the td[4] link and emit minimal rows. ---
                    linkk = str(response.xpath(
                        '//*[@id="ctl00_cphMainApp_GridViewPermitResults"]//tr['
                        + str(v) + ']/td[4]/a/@href').extract_first()
                    ).strip().replace('None', '')
                    # href looks like javascript:__doPostBack('target',''):
                    # the postback target is the quoted segment.
                    link_val = linkk.split("'")
                    link_val = link_val[1]
                    form_data_fs = {
                        'ctl00$cphMainApp$ToolkitScriptManager1':
                        'ctl00$cphMainApp$upSearch|' + str(link_val),
                        '__EVENTTARGET': str(link_val),
                        '__VIEWSTATE':
                        response.text.split('__VIEWSTATE|')[1].split('|')[0],
                        '__EVENTVALIDATION':
                        response.text.split('__EVENTVALIDATION|')[1].split('|')[0],
                        '__VIEWSTATEGENERATOR': 'DCE30F85',
                        '__ASYNCPOST': 'true'
                    }
                    # FIX: URL scheme was 'ttps://' (missing the 'h'), so this
                    # inline request could never be scheduled.
                    parse_first = yield scrapy.FormRequest(
                        url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
                        method='POST',
                        dont_filter=True,
                        formdata=form_data_fs)
                    insp_rep_1 = parse_first.xpath(
                        "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr/td[1]/a/@href"
                    ).extract()
                    if len(insp_rep_1) > 0:
                        for u in range(2, len(insp_rep_1) + 2):
                            permit_subtype_1 = str(parse_first.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[1]/a/text()").extract_first()
                            ).strip().replace('None', '')
                            permit_lic_desc_1 = permit_subtype_1
                            if permit_lic_desc_1 == '' or permit_lic_desc_1 == None:
                                permit_lic_desc_1 = 'Building Permit'
                            appl_no_1 = parse_first.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[2]/a/text()").extract_first()
                            issue_no_1 = parse_first.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[3]/a/text()").extract_first()
                            issue_date_1 = parse_first.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[4]/a/text()").extract_first()
                            permit_lic_status_1 = parse_first.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[6]/a/text()").extract_first()
                            permit_lic_fee_1 = parse_first.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[7]/a/text()").extract_first()
                            data_pass = {
                                'prop type': 'Real Estate',
                                'parcel number': '',
                                'municipality': '',
                                'location_address_string': 'WI',
                                'mixed_name': '',
                                'dba_name': '',
                                'mixed_subtype': '',
                                'mail_address_string': '',
                                'permit_subtype': permit_subtype_1,
                                'permit_lic_no': appl_no_1,
                                'issue #': issue_no_1,
                                'permit_lic_eff_date': issue_date_1,
                                'permit_lic_status': permit_lic_status_1,
                                'permit_lic_fee': permit_lic_fee_1,
                                'inspection_type': '',
                                'inspection_date': '',
                                'inspection_subtype': '',
                                'inspection_pass_fail': '',
                                'person': '',
                                'inspector_comments': '',
                                'permit_lic_desc': permit_lic_desc_1,
                                'permit_type': 'building_permit',
                                'sourceName': '',
                                'url': '',
                                'ingestion_timestamp': ''
                            }
                            yield self.save_to_csv(response, **data_pass)
                else:
                    # --- Row with a parcel number: record it, then drill into
                    # permit details, the parcel's property panel, and every
                    # application's inspection activities. ---
                    csv_row = str(parcelss_noo) + "\n"
                    self.csv.write(csv_row)
                    link_no = str(response.xpath(
                        '//*[@id="ctl00_cphMainApp_GridViewPermitResults"]//tr['
                        + str(v) + ']/td[2]/a/@href').extract_first())
                    link_val = link_no.split("'")
                    link_val = link_val[1]
                    form_data2 = {
                        'ctl00$cphMainApp$ToolkitScriptManager1':
                        'ctl00$cphMainApp$upSearch|' + str(link_val),
                        '__EVENTTARGET': str(link_val),
                        '__VIEWSTATE':
                        response.text.split('__VIEWSTATE|')[1].split('|')[0],
                        '__EVENTVALIDATION':
                        response.text.split('__EVENTVALIDATION|')[1].split('|')[0],
                        '__VIEWSTATEGENERATOR': 'DCE30F85',
                        '__ASYNCPOST': 'true'
                    }
                    parse_third = yield scrapy.FormRequest(
                        url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
                        method='POST',
                        dont_filter=True,
                        formdata=form_data2)
                    # Switch the detail panel to the parcel/property view.
                    form_data3 = {
                        'ctl00$cphMainApp$ToolkitScriptManager1':
                        'ctl00$cphMainApp$SearchDetailsPermit$upSearchDetails|ctl00$cphMainApp$SearchDetailsPermit$LinkButtonParcel',
                        '__EVENTTARGET':
                        'ctl00$cphMainApp$SearchDetailsPermit$LinkButtonParcel',
                        '__VIEWSTATE':
                        parse_third.text.split('__VIEWSTATE|')[1].split('|')[0],
                        '__EVENTVALIDATION':
                        parse_third.text.split('__EVENTVALIDATION|')[1].split('|')[0],
                        '__VIEWSTATEGENERATOR': 'DCE30F85',
                        '__ASYNCPOST': 'true'
                    }
                    parse_property = yield scrapy.FormRequest(
                        url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
                        method='POST',
                        dont_filter=True,
                        formdata=form_data3)
                    prop_type = parse_property.xpath(
                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_RecordTitle1_LabelTitlePropType"]/text()'
                    ).extract_first()
                    parcel_no = parse_property.xpath(
                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_RecordTitle1_LabelTitleParcelNum"]/text()'
                    ).extract_first()
                    municipality = parse_property.xpath(
                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_RecordTitle1_LabelMunicipality"]/text()'
                    ).extract_first()
                    prop_address = str(parse_property.xpath(
                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_RecordTitle1_LabelTitlePropAddr"]/text()'
                    ).extract_first()).strip().replace('None', '').replace('NONE', '')
                    if prop_address == '' or prop_address == None:
                        prop_address = 'WI'
                    else:
                        prop_address = prop_address + ', WI'
                    # Billing address block: last two lines are the address,
                    # any preceding lines are owner name(s).
                    valss = parse_property.xpath(
                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_RecordTitle1_lblBillingAddress"]/text()'
                    ).extract()
                    # FIX: initialise defaults so a short/empty billing block
                    # no longer raises NameError or leaks values from the
                    # previous result row.
                    name1 = []
                    address = ''
                    mixed_name = ''
                    dba_name = ''
                    if len(valss) == 3:
                        name1 = [valss[0]]
                        address = valss[1] + ', ' + valss[2]
                    elif len(valss) > 3:
                        name1 = valss[0:-2]
                        address = valss[-2] + ', ' + valss[-1]
                    address = address.replace(' WI ', ', WI ')
                    for mixed_name1 in name1:
                        # Normalise the various DBA spellings, then split the
                        # owner name from the trade name.
                        if ' dba ' in mixed_name1 or '(DBA ' in mixed_name1 or 'DBA:' in mixed_name1 or ' DBA ' in mixed_name1:
                            mixed_name1 = mixed_name1.replace(
                                ' dba ', ' DBA ').replace('(DBA ', ' DBA ').replace('DBA:', ' DBA ')
                            mixed_name1 = mixed_name1.split(' DBA ')
                            mixed_name = mixed_name1[0]
                            dba_name = mixed_name1[1]
                        else:
                            mixed_name = mixed_name1
                            dba_name = ''
                    insp_rep = parse_third.xpath(
                        "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr/td[1]/a/@href"
                    ).extract()
                    if len(insp_rep) > 0:
                        for u in range(2, len(insp_rep) + 2):
                            detail_link = parse_third.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[1]/a/@href").extract_first()
                            permit_subtype = str(parse_third.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[1]/a/text()").extract_first()
                            ).strip().replace('None', '')
                            permit_lic_desc = permit_subtype
                            if permit_lic_desc == '' or permit_lic_desc == None:
                                permit_lic_desc = 'Building Permit'
                            appl_no = parse_third.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[2]/a/text()").extract_first()
                            issue_no = parse_third.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[3]/a/text()").extract_first()
                            issue_date = parse_third.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[4]/a/text()").extract_first()
                            permit_lic_status = parse_third.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[6]/a/text()").extract_first()
                            permit_lic_fee = str(parse_third.xpath(
                                "//table[@id='ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_gvApplications']//tr["
                                + str(u) + "]/td[7]/a/text()").extract_first()
                            ).strip().replace('$0.00', '')
                            link = detail_link.split("'")
                            link = link[1]
                            form_data4 = {
                                'ctl00$cphMainApp$ToolkitScriptManager1':
                                'ctl00$cphMainApp$SearchDetailsPermit$upSearchDetails|' + str(link),
                                '__EVENTTARGET': str(link),
                                '__VIEWSTATE':
                                parse_third.text.split('__VIEWSTATE|')[1].split('|')[0],
                                '__EVENTVALIDATION':
                                parse_third.text.split('__EVENTVALIDATION|')[1].split('|')[0],
                                '__VIEWSTATEGENERATOR': 'DCE30F85',
                                '__ASYNCPOST': 'true'
                            }
                            parse_insp = yield scrapy.FormRequest(
                                url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
                                method='POST',
                                dont_filter=True,
                                formdata=form_data4)
                            lenn = parse_insp.xpath(
                                '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_FormView1_gvActivities"]//tr/td[1]'
                            ).extract()
                            if len(lenn) > 0 and 'There are no activities associate' not in lenn[0]:
                                # One output item per inspection activity.
                                for i in range(2, len(lenn) + 2):
                                    insp_date = parse_insp.xpath(
                                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_FormView1_gvActivities"]//tr['
                                        + str(i) + ']/td[1]/text()').extract_first()
                                    insp_type = 'building_inspection'
                                    insp_subtype = parse_insp.xpath(
                                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_FormView1_gvActivities"]//tr['
                                        + str(i) + ']/td[2]/span/text()').extract_first()
                                    inspection_pass_fail = parse_insp.xpath(
                                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_FormView1_gvActivities"]//tr['
                                        + str(i) + ']/td[3]/span/text()').extract_first()
                                    person = str(parse_insp.xpath(
                                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_FormView1_gvActivities"]//tr['
                                        + str(i) + ']/td[4]/span/text()').extract_first()
                                    ).strip().replace('None', '')
                                    inspector_comments = str(parse_insp.xpath(
                                        '//*[@id="ctl00_cphMainApp_SearchDetailsPermit_PermitDetails1_FormView1_gvActivities"]//tr['
                                        + str(i) + ']/td[5]/text()').extract_first()
                                    ).strip().replace('None', '')
                                    inspector_comments = re.sub(
                                        r'\s+', ' ', inspector_comments)
                                    data_pass = {
                                        'prop type': prop_type,
                                        'parcel number': parcel_no,
                                        'municipality': municipality,
                                        'location_address_string': prop_address,
                                        'mixed_name': mixed_name,
                                        'dba_name': dba_name,
                                        'mixed_subtype': 'Owner',
                                        'mail_address_string': address,
                                        'permit_subtype': permit_subtype,
                                        'permit_lic_no': appl_no,
                                        'issue #': issue_no,
                                        'permit_lic_eff_date': issue_date,
                                        'permit_lic_status': permit_lic_status,
                                        'permit_lic_fee': permit_lic_fee,
                                        'inspection_type': insp_type,
                                        'inspection_date': insp_date,
                                        'inspection_subtype': insp_subtype,
                                        'inspection_pass_fail': inspection_pass_fail,
                                        'person': person,
                                        'inspector_comments': inspector_comments,
                                        'permit_lic_desc': permit_lic_desc,
                                        'permit_type': 'building_permit',
                                        'sourceName': '',
                                        'url': '',
                                        'ingestion_timestamp': ''
                                    }
                                    yield self.save_to_csv(response, **data_pass)
                            else:
                                # Application with no activities: emit it once
                                # with the inspection fields blank.
                                data_pass = {
                                    'prop type': prop_type,
                                    'parcel number': parcel_no,
                                    'municipality': municipality,
                                    'location_address_string': prop_address,
                                    'mixed_name': mixed_name,
                                    'dba_name': dba_name,
                                    'mixed_subtype': 'Owner',
                                    'mail_address_string': address,
                                    'permit_subtype': permit_subtype,
                                    'permit_lic_no': appl_no,
                                    'issue #': issue_no,
                                    'permit_lic_eff_date': issue_date,
                                    'permit_lic_status': permit_lic_status,
                                    'permit_lic_fee': permit_lic_fee,
                                    'inspection_type': '',
                                    'inspection_date': '',
                                    'inspection_subtype': '',
                                    'inspection_pass_fail': '',
                                    'person': '',
                                    'inspector_comments': '',
                                    'permit_lic_desc': permit_lic_desc,
                                    'permit_type': 'building_permit',
                                    'sourceName': '',
                                    'url': '',
                                    'ingestion_timestamp': ''
                                }
                                yield self.save_to_csv(response, **data_pass)
            # Follow grid pagination for the current search term, if any.
            next_page = str(response.xpath(
                "//*[@id='pager']/table[@border='0']//tr[1]/td/span/following::td[1]/a/@href"
            ).extract_first())
            if 'Page$' in next_page:
                next_link = next_page.split("'")
                next_link = next_link[-2]
                form_data_last = {
                    'ctl00$cphMainApp$ToolkitScriptManager1':
                    'ctl00$cphMainApp$upSearch|ctl00$cphMainApp$GridViewPermitResults',
                    '__EVENTTARGET': 'ctl00$cphMainApp$GridViewPermitResults',
                    '__EVENTARGUMENT': str(next_link),
                    '__VIEWSTATE':
                    response.text.split('__VIEWSTATE|')[1].split('|')[0],
                    '__EVENTVALIDATION':
                    response.text.split('__EVENTVALIDATION|')[1].split('|')[0],
                    '__VIEWSTATEGENERATOR': 'DCE30F85',
                    '__ASYNCPOST': 'true'
                }
                yield scrapy.FormRequest(
                    url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
                    method='POST',
                    dont_filter=True,
                    formdata=form_data_last,
                    callback=self.parse_second)
        # Move on to the next search term when any remain.
        if len(self.search_element) > 0:
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_things,
                                 dont_filter=True)

    def save_to_csv(self, response, **data_pass):
        """Build and return one loaded output item from the field dict.

        Despite the name this does not touch the side CSV; it populates the
        Scrapy item that the export pipeline serialises.
        """
        if data_pass['permit_lic_desc'] == '' or data_pass['permit_lic_desc'] == None:
            data_pass['permit_lic_desc'] = 'Building Permit'
        il = ItemLoader(item=WiDouglasBuildingPermitsSpiderItem(),
                        response=response)
        il.default_input_processor = MapCompose(lambda v: v.strip(),
                                                remove_tags,
                                                replace_escape_chars)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value(
            'url', 'https://gcs.douglascountywi.org/gcswebportal/search.aspx')
        il.add_value('sourceName', 'WI_Douglas_Building_Permits')
        il.add_value('inspector_comments', data_pass['inspector_comments'])
        il.add_value('mixed_name', data_pass['mixed_name'])
        il.add_value('permit_subtype', data_pass['permit_subtype'])
        il.add_value('permit_lic_desc', data_pass['permit_lic_desc'])
        il.add_value('mixed_subtype', data_pass['mixed_subtype'])
        il.add_value('permit_type', data_pass['permit_type'])
        il.add_value('permit_lic_fee', data_pass['permit_lic_fee'])
        il.add_value('inspection_pass_fail', data_pass['inspection_pass_fail'])
        il.add_value('permit_lic_status', data_pass['permit_lic_status'])
        il.add_value('location_address_string',
                     data_pass['location_address_string'])
        il.add_value('dba_name', data_pass['dba_name'])
        il.add_value('inspection_subtype', data_pass['inspection_subtype'])
        il.add_value('permit_lic_eff_date', data_pass['permit_lic_eff_date'])
        il.add_value('permit_lic_no', data_pass['permit_lic_no'])
        il.add_value('prop type', data_pass['prop type'])
        il.add_value('inspection_date', data_pass['inspection_date'])
        il.add_value('inspection_type', data_pass['inspection_type'])
        il.add_value('person', data_pass['person'])
        il.add_value('municipality', data_pass['municipality'])
        il.add_value('issue #', data_pass['issue #'])
        il.add_value('parcel number', data_pass['parcel number'])
        il.add_value('mail_address_string', data_pass['mail_address_string'])
        return il.load_item()
class WaWhatcomBellinghamBuildingPermitsSpider(CommonSpider): name = '973_wa_whatcom_bellingham_building_permits' allowed_domains = ['cob.org'] start_urls = ['https://www.cob.org/epermits/Search/permit.aspx'] custom_settings = { 'FILE_NAME':Utils.getRundateFileName('AI-973_Permits_Buildings_WA_Whatcom_Bellingham_CurationReady'), 'JIRA_ID':'AI_973', 'DOWNLOAD_DELAY':5, 'COOKIES_ENABLED':True, 'TRACKING_OPTIONAL_PARAMS':['record_number'], 'COOKIES_DEBUG':True, 'HTTPCACHE_ENABLED':False, # 'JOBDIR' : CustomSettings.getJobDirectory('WaWhatcomBellinghamBuildingPermitsSpider'), 'TOP_HEADER':{'apn/pin': 'APN/PIN','approved date': 'Approved Date','contractor_address_string': '','contractor_dba': '','dba_name': '','finaled date': 'Finaled Date','inspection_date': 'Completed','inspection_pass_fail': 'Result','inspection_subtype': 'Type','inspection_type': '','location_address_string': 'Address','mixed_contractor_name': 'Contractor','mixed_name': '','mixed_subtype': '','parcel #': 'Parcel #','permit_applied_date': 'Applied Date','Status': 'Status','permit_lic_eff_date': 'Issued Date','permit_lic_exp_date': 'Expiration Date','permit_lic_fee': 'Fees','permit_lic_no': 'Permit #','permit_subtype': 'Permit Type','permit_type': '','person_address_string': '','property type': 'Property Type','subtype': 'Subtype','permit_lic_desc':'Project Description'}, 'FIELDS_TO_EXPORT':['permit_lic_no','permit_subtype','subtype','property type','permit_lic_desc','Status','permit_applied_date','approved date','permit_lic_eff_date','finaled date','permit_lic_exp_date','location_address_string','apn/pin','parcel #','permit_lic_fee','mixed_name','dba_name','mixed_subtype','person_address_string','mixed_contractor_name','contractor_dba','contractor_address_string','inspection_subtype','inspection_date','inspection_pass_fail','inspection_type','permit_type','url','sourceName','ingestion_timestamp'], 'NULL_HEADERS':['subtype', 'property type', 'Status','approved date', 'finaled date', 'apn/pin', 'parcel 
#'] } search_element = [] check_first = True end_date='' SearchCriteria=['BAN', 'BLD', 'CCI', 'CLR', 'CMB', 'CON', 'DEM', 'ELB', 'ELE', 'FAP', 'FCP', 'FHP', 'FLP', 'FPP', 'FSP', 'GRD', 'LUA', 'LUH', 'MEC', 'MPR', 'OCC', 'PBW', 'PLM', 'SGN', 'SPI', 'STM', 'STP', 'STR', 'TMP', 'UFC', 'WQP'] def parse(self, response): headers={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'en-US,en;q=0.9', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Content-Length': '11564', 'Content-Type': 'application/x-www-form-urlencoded', 'DNT': '1', 'Host': 'www.cob.org', 'Origin': 'https://www.cob.org', 'Referer': 'https://www.cob.org/epermits/login.aspx?lt=either&rd=~/Search/permit.aspx', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' } form_data={ '__LASTFOCUS':'', 'RadScriptManager1_TSM': ';;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:1453655a-6b8d-49b1-94c2-f77a352f5241:ea597d4b:b25378d2;Telerik.Web.UI, Version=2013.2.717.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:0507d587-20ad-4e22-b866-76bd3eaee2df:16e4e7cd:ed16cbdc:f7645509:24ee1bba:92fe8ea0:f46195d3:fa31b949:874f8ea2:19620875:490a9d4e:bd8f85e4:b7778d6c', '__EVENTTARGET': '', '__EVENTARGUMENT': '', '__VIEWSTATE': response.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first(), '__VIEWSTATEGENERATOR': response.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(), '__EVENTVALIDATION':response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first(), 'ctl00_ucLogin_rwmLogin_ClientState': '', 'ctl00$ucLogin$hfDashboardRedirect': 'https://www.cob.org/epermits/dashboard.aspx', 'ctl00$ucLogin$hfCartRedirect': 'https://www.cob.org/epermits/ShoppingCart.aspx', 
'ctl00$ucLogin$hfViewEditProfile': 'static value', 'ctl00$ucLogin$hfHome': 'https://www.cob.org/epermits/default.aspx', 'ctl00$ucLogin$hfSetupAnAccountForPublic': 'https://www.cob.org/epermits/publicUserAccount.aspx?action=npa', 'ctl00$ucLogin$hfSetupAnAccountForContractor': 'https://www.cob.org/epermits/RegistrationConfirmation.aspx', 'ctl00$ucLogin$hfContractorCSLBVerification': 'DISABLED', 'ctl00$ucLogin$ddlSelLogin': '******', 'ctl00$ucLogin$txtLoginId': 'Username', 'ctl00_ucLogin_txtLoginId_ClientState': '{"enabled":true,"emptyMessage":"Username","validationText":"","valueAsString":"","lastSetTextBoxValue":"Username"}', 'ctl00$ucLogin$RadTextBox2': 'Password', 'ctl00_ucLogin_RadTextBox2_ClientState': '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}', 'ctl00$ucLogin$txtPassword': '', 'ctl00_ucLogin_txtPassword_ClientState': '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}', 'ctl00$hfGoogleKey': 'UA-5831706-1', 'ctl00$cplMain$txtPublicUserName': '******', 'ctl00$cplMain$txtPublicPassword': '******', 'ctl00$cplMain$btnPublicLogin': '******', 'ctl00$cplMain$txtStLicNo': '', 'ctl00$cplMain$txtContractorPassword': '', 'ctl00$cplMain$txtEnterKeySubmit': '' } yield FormRequest(url='https://www.cob.org/epermits/login.aspx?lt=either&rd=%7e%2fSearch%2fpermit.aspx',formdata=form_data,headers=headers,callback= self.search, dont_filter=True) def search(self,response): if self.check_first: self.check_first = False self.search_element=self.SearchCriteria[int(self.start):int(self.end)] if len(self.search_element) > 0: param = self.search_element.pop(0) form_data={ 'ctl00$RadScriptManager1': 'ctl00$RadScriptManager1|ctl00$cplMain$btnSearch', 'RadScriptManager1_TSM': ';;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:1453655a-6b8d-49b1-94c2-f77a352f5241:ea597d4b:b25378d2;Telerik.Web.UI, Version=2013.2.717.40, 
Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:0507d587-20ad-4e22-b866-76bd3eaee2df:16e4e7cd:ed16cbdc:f7645509:24ee1bba:92fe8ea0:f46195d3:fa31b949:874f8ea2:19620875:490a9d4e:bd8f85e4:b7778d6c:58366029:e330518b:1e771326:8e6f0d33:6a6d718d;', 'ctl00$ucLogin$hfDashboardRedirect': 'https://www.cob.org/epermits/dashboard.aspx', 'ctl00$ucLogin$hfCartRedirect': 'https://www.cob.org/epermits/ShoppingCart.aspx', 'ctl00$ucLogin$hfViewEditProfile': 'static value', 'ctl00$ucLogin$hfHome': 'https://www.cob.org/epermits/default.aspx', 'ctl00$ucLogin$hfSetupAnAccountForPublic': 'https://www.cob.org/epermits/publicUserAccount.aspx?action=npa', 'ctl00$ucLogin$hfSetupAnAccountForContractor': 'https://www.cob.org/epermits/RegistrationConfirmation.aspx', 'ctl00$ucLogin$hfContractorCSLBVerification': 'DISABLED', 'ctl00$ucLogin$ddlSelLogin': '******', 'ctl00$ucLogin$txtLoginId': 'Username', 'ctl00_ucLogin_txtLoginId_ClientState': '{"enabled":true,"emptyMessage":"Username","validationText":"","valueAsString":"","lastSetTextBoxValue":"Username"}', 'ctl00$ucLogin$RadTextBox2': 'Password', 'ctl00_ucLogin_RadTextBox2_ClientState': '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}', 'ctl00_ucLogin_txtPassword_ClientState': '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}', 'ctl00$hfGoogleKey': 'UA-5831706-1', 'ctl00$cplMain$hfActivityMode':'', 'ctl00$cplMain$ddSearchBy': 'Permit_Main.PERMIT_NO', 'ctl00$cplMain$ddSearchOper': 'CONTAINS', 'ctl00$cplMain$txtSearchString': str(param), 'ctl00_cplMain_rgSearchRslts_ClientState': '{"selectedIndexes":["0"],"selectedCellsIndexes":[],"unselectableItemsIndexes":[],"reorderedColumns":[],"expandedItems":[],"expandedGroupItems":[],"expandedFilterItems":[],"deletedItems":[],"hidedColumns":[],"showedColumns":[],"groupColsState":{},"hierarchyState":{},"popUpLocations":{},"draggedItemsIndexes":[]}', 
'ctl00_cplMain_tcSearchDetails_ClientState': '{"selectedIndexes":["0"],"logEntries":[],"scrollState":{}}', '__EVENTTARGET': 'ctl00$cplMain$btnSearch', '__VIEWSTATE':response.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first(), '__VIEWSTATEGENERATOR': response.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(), '__ASYNCPOST': 'true', 'RadAJAXControlID': 'ctl00_RadAjaxManager1' } currentPage = 1 yield FormRequest(url=self.start_urls[0], headers={'Referer': self.start_urls},formdata=form_data,callback= self.parse_list, dont_filter=True,meta={'currentPage':currentPage,'param':param}) def parse_list(self,response): currentPage = response.meta['currentPage'] param=response.meta['param'] responseValues = response.text.split('|') viewstate = "" viewgenerator = "" for i in range(len(responseValues)): if responseValues[i] == "__VIEWSTATE": viewstate = responseValues[i+1] if responseValues[i] == "__VIEWSTATEGENERATOR": viewgenerator = responseValues[i+1] if responseValues[i] == "__VIEWSTATEGENERATOR": viewgenerator = responseValues[i+1] table=response.xpath('//table[@id="ctl00_cplMain_rgSearchRslts_ctl00"]//tr')[7:] for ind,a in enumerate(table): record_number=a.xpath('td[1]/text()').extract_first() address=a.xpath('td[2]/span/text()').extract_first() parcel_number=a.xpath('td[4]/span/text()').extract_first() form_data_2={ 'ctl00$RadScriptManager1': 'ctl00$ctl00$cplMain$rgSearchRsltsPanel|ctl00$cplMain$rgSearchRslts', 'RadScriptManager1_TSM': ";;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:1453655a-6b8d-49b1-94c2-f77a352f5241:ea597d4b:b25378d2;Telerik.Web.UI, Version=2013.2.717.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:0507d587-20ad-4e22-b866-76bd3eaee2df:16e4e7cd:ed16cbdc:f7645509:24ee1bba:92fe8ea0:f46195d3:fa31b949:874f8ea2:19620875:490a9d4e:bd8f85e4:b7778d6c:58366029:e330518b:1e771326:8e6f0d33:6a6d718d;", 'ctl00$ucLogin$hfDashboardRedirect': 'https://www.cob.org/epermits/dashboard.aspx', 
'ctl00$ucLogin$hfCartRedirect': 'https://www.cob.org/epermits/ShoppingCart.aspx', 'ctl00$ucLogin$hfViewEditProfile': 'static value', 'ctl00$ucLogin$hfHome': 'https://www.cob.org/epermits/default.aspx', 'ctl00$ucLogin$hfSetupAnAccountForPublic': 'https://www.cob.org/epermits/publicUserAccount.aspx?action=npa', 'ctl00$ucLogin$hfSetupAnAccountForContractor': 'https://www.cob.org/epermits/RegistrationConfirmation.aspx', 'ctl00$ucLogin$hfContractorCSLBVerification': 'DISABLED', 'ctl00$ucLogin$ddlSelLogin': '******', 'ctl00$ucLogin$txtLoginId': 'Username', 'ctl00_ucLogin_txtLoginId_ClientState': '{"enabled":true,"emptyMessage":"Username","validationText":"","valueAsString":"","lastSetTextBoxValue":"Username"}', 'ctl00$ucLogin$RadTextBox2': 'Password', 'ctl00_ucLogin_RadTextBox2_ClientState': '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}', 'ctl00_ucLogin_txtPassword_ClientState': '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}', 'ctl00$hfGoogleKey': 'UA-5831706-1', 'ctl00$cplMain$ddSearchBy': 'Permit_Main.PERMIT_NO', 'ctl00$cplMain$ddSearchOper': 'CONTAINS', 'ctl00$cplMain$txtSearchString': str(param), 'ctl00_cplMain_rgSearchRslts_ClientState': '{"selectedIndexes":["0"],"selectedCellsIndexes":[],"unselectableItemsIndexes":[],"reorderedColumns":[],"expandedItems":[],"expandedGroupItems":[],"expandedFilterItems":[],"deletedItems":[],"hidedColumns":[],"showedColumns":[],"groupColsState":{},"hierarchyState":{},"scrolledPosition":"0,0","popUpLocations":{},"draggedItemsIndexes":[]}', 'ctl00_cplMain_tcSearchDetails_ClientState': '{"selectedIndexes":["2"],"logEntries":[],"scrollState":{}}', '__EVENTTARGET': 'ctl00$cplMain$rgSearchRslts', '__EVENTARGUMENT': 'RowClick;'+str(ind), '__VIEWSTATE': viewstate, '__VIEWSTATEGENERATOR': viewgenerator, '__ASYNCPOST': 'true', 'RadAJAXControlID': 'ctl00_RadAjaxManager1' } yield 
FormRequest(url=self.start_urls[0],formdata=form_data_2,callback= self.get_data, dont_filter=True,meta={'record_number':record_number,'address':address,'parcel_number':parcel_number,'optional':{'record_number':record_number}}) total_pages=response.xpath("//tr[5]/td/span[@class='font12 italic']/text()").extract_first() if total_pages: pages=str(total_pages).split('of')[1] page=pages.strip() headers={ 'Accept': '*/*', 'Connection': 'keep-alive', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Host': 'www.cob.org', 'Origin': 'https://www.cob.org', 'Referer': 'https://www.cob.org/epermits/Search/permit.aspx', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'} if int(currentPage) < int(page)+1: form_data_3={ 'ctl00$RadScriptManager1': 'ctl00$ctl00$cplMain$rgSearchRsltsPanel|ctl00$cplMain$rgSearchRslts', 'RadScriptManager1_TSM': ';;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:1453655a-6b8d-49b1-94c2-f77a352f5241:ea597d4b:b25378d2;Telerik.Web.UI, Version=2013.2.717.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:0507d587-20ad-4e22-b866-76bd3eaee2df:16e4e7cd:ed16cbdc:f7645509:24ee1bba:92fe8ea0:f46195d3:fa31b949:874f8ea2:19620875:490a9d4e:bd8f85e4:b7778d6c:58366029:e330518b:1e771326:8e6f0d33:6a6d718d;', 'ctl00$ucLogin$hfDashboardRedirect': 'https://www.cob.org/epermits/dashboard.aspx', 'ctl00$ucLogin$hfCartRedirect': 'https://www.cob.org/epermits/ShoppingCart.aspx', 'ctl00$ucLogin$hfViewEditProfile': 'static value', 'ctl00$ucLogin$hfHome': 'https://www.cob.org/epermits/default.aspx', 'ctl00$ucLogin$hfSetupAnAccountForPublic': 'https://www.cob.org/epermits/publicUserAccount.aspx?action=npa', 'ctl00$ucLogin$hfSetupAnAccountForContractor': 'https://www.cob.org/epermits/RegistrationConfirmation.aspx', 'ctl00$ucLogin$hfContractorCSLBVerification': 'DISABLED', 'ctl00$ucLogin$ddlSelLogin': '******', 
'ctl00$ucLogin$txtLoginId': 'Username', 'ctl00_ucLogin_txtLoginId_ClientState': '{"enabled":true,"emptyMessage":"Username","validationText":"","valueAsString":"","lastSetTextBoxValue":"Username"}', 'ctl00$ucLogin$RadTextBox2': 'Password', 'ctl00_ucLogin_RadTextBox2_ClientState': '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}', 'ctl00_ucLogin_txtPassword_ClientState': '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}', 'ctl00$hfGoogleKey': 'UA-5831706-1', 'ctl00$cplMain$activeTab': '1', 'ctl00$cplMain$ddSearchBy': 'Permit_Main.PERMIT_NO', 'ctl00$cplMain$ddSearchOper': 'CONTAINS', 'ctl00$cplMain$txtSearchString': str(param), 'ctl00_cplMain_tcSearchDetails_ClientState': '{"selectedIndexes":["1"],"logEntries":[],"scrollState":{}}', '__EVENTTARGET': 'ctl00$cplMain$rgSearchRslts', '__EVENTARGUMENT': 'FireCommand:ctl00$cplMain$rgSearchRslts$ctl00;Page;next', '__LASTFOCUS': '', '__VIEWSTATE': viewstate, '__VIEWSTATEGENERATOR': viewgenerator, '__ASYNCPOST': 'true', 'RadAJAXControlID': 'ctl00_RadAjaxManager1' } yield FormRequest(url=self.start_urls[0],formdata=form_data_3,headers=headers,callback= self.parse_list, dont_filter=True,meta={'currentPage': int(currentPage)+1,'param':param}) if len(self.search_element) > 0: yield scrapy.Request(url=self.start_urls[0], callback=self.search, dont_filter=True) @inline_requests def get_data(self,response): meta={} 
meta["mixed_name"]=meta["mixed_subtype"]=meta["person_address_string"]=meta["mixed_contractor_name"]=meta["contractor_address_string"]=meta["inspection_subtype"]=meta["inspection_type"]=meta["inspection_pass_fail"]=meta["completed_date"]=meta['address']=meta["apn_pin"]=meta["parent_project"]=meta["property_type"]=meta["permit_lic_exp_date"]=meta["finaled_date"]=meta["approved_date"]=meta["permit_lic_eff_date"]=meta["permit_applied_date"]=meta["permit_lic_status"]=meta["permit_lic_desc"]=meta['permit_subtype']=meta['permit_lic_type']=meta['parcel_number']=meta['record_number']='' address=response.meta['address'] meta['parcel_number']=response.meta['parcel_number'] meta['record_number']=response.meta['record_number'] meta['permit_lic_type']=response.xpath("//tr/td[(contains(span,'Type:'))]/following-sibling::td/span/text()").extract_first() meta['permit_subtype']=response.xpath("//tr/td[(contains(span,'Subtype:'))]/following-sibling::td/span/text()").extract_first() if response.xpath("//tr/td[(contains(span,'Project Description:'))]/following-sibling::td/span/text()").extract_first()=='': meta["permit_lic_desc"]=meta["permit_subtype"] else: meta["permit_lic_desc"]=response.xpath("//tr/td[(contains(span,'Project Description:'))]/following-sibling::td/span/text()").extract_first() meta["permit_lic_status"]=response.xpath("//tr/td[(contains(span,'Status:'))]/following-sibling::td/span/text()").extract_first() meta["permit_applied_date"]=response.xpath("//tr/td[(contains(span,'Applied Date:'))]/following-sibling::td/span/text()").extract_first() meta["permit_lic_eff_date"]=response.xpath("//tr/td[(contains(span,'Issued Date:'))]/following-sibling::td/span/text()").extract_first() meta["approved_date"]=response.xpath("//tr/td[(contains(span,'Approved Date:'))]/following-sibling::td/span/text()").extract_first() meta["finaled_date"]=response.xpath("//tr/td[(contains(span,'Finaled Date:'))]/following-sibling::td/span/text()").extract_first() 
meta["permit_lic_exp_date"]=response.xpath("//tr/td[(contains(span,'Expiration Date:'))]/following-sibling::td/span/text()").extract_first() meta["property_type"]=response.xpath("//tr/td[(contains(span,'Property Type'))]/following-sibling::td/span/text()").extract_first() meta["parent_project"]=response.xpath('//*[@id="cplMain_ctl02_ctl02_tableParentProjects"]//tr[2]/td[2]/a/text()').extract_first() meta["apn_pin"]=response.xpath("//tr/td[(contains(span,'APN/PIN:'))]/following-sibling::td/a/text()").extract_first() City_State_Zip=response.xpath("//tr/td[(contains(span,'City/State/Zip:'))]/following-sibling::td/span/text()").extract_first() if address: if address and City_State_Zip: meta['address']=address+', '+City_State_Zip elif address: meta['address']=address+', WA' else: meta['address']='WA' meta['permit_lic_fee']=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl05_rgFeeDetails')]/td[@class='ellipsis'][2]/span/text()").extract_first() meta["contact_len"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]/td[@class='ellipsis'][1]/text()").extract() yield self.save_to_csv(response,**meta) if meta["contact_len"]: meta["contact_len"]=meta["contact_len"] for i in range(1,int(len(meta["contact_len"]))+1): meta["mixed_sub"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][1]/text()").extract_first() if str(meta["mixed_sub"]).strip()=='CONTRACTOR': meta["mixed_name"]=meta["mixed_subtype"]=meta["person_address_string"]=meta["inspection_subtype"]=meta["inspection_type"]=meta["inspection_pass_fail"]=meta["completed_date"]='' meta["mixed_contractor_name"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][2]/text()").extract_first() meta["contractor_address_string"]=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][3]/text()").extract_first()).strip()+', 
'+str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][4]/text()").extract_first()).strip() yield self.save_to_csv(response,**meta) else: meta["mixed_contractor_name"]=meta["contractor_address_string"]=meta["inspection_subtype"]=meta["inspection_type"]=meta["inspection_pass_fail"]=meta["completed_date"]='' meta["mixed_name"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][2]/text()").extract_first() meta["mixed_subtype"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][1]/text()").extract_first() person_add1=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][3]/text()").extract_first()).strip() person_add2=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl04_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][4]/text()").extract_first()).strip() if person_add1 and person_add1: meta["person_address_string"]=person_add1+', '+person_add2 elif person_add1: meta["person_address_string"]=person_add1+', WA' else: meta["person_address_string"]='WA' yield self.save_to_csv(response,**meta) else: meta["contact_len"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]/td[@class='ellipsis'][1]/text()").extract() if meta["contact_len"]: for i in range(1,int(len(meta["contact_len"]))+1): meta["mixed_sub"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][1]/text()").extract_first() if str(meta["mixed_sub"]).strip()=='CONTRACTOR': meta["mixed_name"]=meta["mixed_subtype"]=meta["person_address_string"]=meta["inspection_subtype"]=meta["inspection_type"]=meta["inspection_pass_fail"]=meta["completed_date"]='' meta["mixed_contractor_name"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][2]/text()").extract_first() 
meta["contractor_address_string"]=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][3]/text()").extract_first()).strip()+', '+str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][4]/text()").extract_first()).strip() yield self.save_to_csv(response,**meta) else: meta["mixed_contractor_name"]=meta["contractor_address_string"]=meta["inspection_subtype"]=meta["inspection_type"]=meta["inspection_pass_fail"]=meta["completed_date"]='' meta["mixed_name"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][2]/text()").extract_first() meta["mixed_subtype"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][1]/text()").extract_first() person_add1=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][3]/text()").extract_first()).strip() person_add2=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][4]/text()").extract_first()).strip() if person_add1 and person_add1: meta["person_address_string"]=person_add1+', '+person_add2 elif person_add1: meta["person_address_string"]=person_add1+', WA' else: meta["person_address_string"]='WA' yield self.save_to_csv(response,**meta) else: meta["contact_len"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]/td[@class='ellipsis'][1]/text()").extract() for i in range(1,int(len(meta["contact_len"]))+1): meta["mixed_sub"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][1]/text()").extract_first() if str(meta["mixed_sub"]).strip()=='CONTRACTOR': meta["mixed_name"]=meta["mixed_subtype"]=meta["person_address_string"]=meta["inspection_subtype"]=meta["inspection_type"]=meta["inspection_pass_fail"]=meta["completed_date"]='' 
meta["mixed_contractor_name"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][2]/text()").extract_first() meta["contractor_address_string"]=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][3]/text()").extract_first()).strip()+', '+str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][4]/text()").extract_first()).strip() yield self.save_to_csv(response,**meta) else: meta["mixed_contractor_name"]=meta["contractor_address_string"]=meta["inspection_subtype"]=meta["inspection_type"]=meta["inspection_pass_fail"]=meta["completed_date"]='' meta["mixed_name"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][2]/text()").extract_first() meta["mixed_subtype"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][1]/text()").extract_first() person_add1=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][3]/text()").extract_first()).strip() person_add2=str(response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgContactInfo')]["+str(i)+"]/td[@class='ellipsis'][4]/text()").extract_first()).strip() if person_add1 and person_add1: meta["person_address_string"]=person_add1+', '+person_add2 elif person_add1: meta["person_address_string"]=person_add1+', WA' else: meta["person_address_string"]='WA' yield self.save_to_csv(response,**meta) insp_len=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgInspectionInfo_ctl00')]/td[@class='ellipsis'][1]/text()").extract() if insp_len: for j in range(1,int(len(insp_len))+1): meta["mixed_name"]=meta["mixed_subtype"]=meta["person_address_string"]=meta["mixed_contractor_name"]=meta["contractor_address_string"]='' 
meta["inspection_subtype"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgInspectionInfo_ctl00')]["+str(j)+"]/td[@class='ellipsis'][1]/text()").extract_first() meta["inspection_type"]='' if meta["inspection_subtype"]=='': meta["inspection_type"]='' else: meta["inspection_type"]='building_inspection' meta["inspection_pass_fail"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgInspectionInfo_ctl00')]["+str(j)+"]/td[@class='ellipsis'][2]/text()").extract_first() meta["completed_date"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl06_rgInspectionInfo_ctl00')]["+str(j)+"]/td[@class='ellipsis'][4]/text()").extract_first() yield self.save_to_csv(response,**meta) else: insp_len=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgInspectionInfo_ctl00')]/td[@class='ellipsis'][1]/text()").extract() for j in range(1,int(len(insp_len))+1): meta["mixed_name"]=meta["mixed_subtype"]=meta["person_address_string"]=meta["mixed_contractor_name"]=meta["contractor_address_string"]='' meta["inspection_subtype"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgInspectionInfo_ctl00')]["+str(j)+"]/td[@class='ellipsis'][1]/text()").extract_first() meta["inspection_type"]='' if meta["inspection_subtype"]=='': meta["inspection_type"]='' else: meta["inspection_type"]='building_inspection' meta["inspection_pass_fail"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgInspectionInfo_ctl00')]["+str(j)+"]/td[@class='ellipsis'][2]/text()").extract_first() meta["completed_date"]=response.xpath("//tr[contains(@id,'ctl00_cplMain_ctl08_rgInspectionInfo_ctl00')]["+str(j)+"]/td[@class='ellipsis'][4]/text()").extract_first() yield self.save_to_csv(response,**meta) def save_to_csv(self,response,**meta): il = ItemLoader(item=WaWhatcomBellinghamBuildingPermitsSpiderItem()) # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars) il.add_value('ingestion_timestamp', Utils.getingestion_timestamp()) il.add_value('sourceName', 
'WA_Whatcom_Bellingham_Building_Permits') il.add_value('url', 'https://www.cob.org/epermits/Search/permit.aspx') il.add_value('permit_lic_no', meta['record_number']) il.add_value('permit_subtype', meta['permit_lic_type']) il.add_value('subtype', meta['permit_subtype']) il.add_value('property type', meta["property_type"]) if meta["permit_lic_desc"]: meta["permit_lic_desc"]=meta["permit_lic_desc"] else: meta["permit_lic_desc"]='Building Permit' il.add_value('permit_lic_desc', meta["permit_lic_desc"]) il.add_value('Status', meta["permit_lic_status"]) il.add_value('permit_applied_date',meta["permit_applied_date"]) il.add_value('approved date', meta["approved_date"]) il.add_value('permit_lic_eff_date', meta["permit_lic_eff_date"]) il.add_value('finaled date', meta["finaled_date"]) il.add_value('permit_lic_exp_date', meta["permit_lic_exp_date"]) il.add_value('location_address_string',meta['address']) il.add_value('apn/pin',meta["apn_pin"]) il.add_value('parcel #',meta['parcel_number']) il.add_value('permit_lic_fee',meta['permit_lic_fee']) il.add_value('mixed_name',self._getDBA(meta['mixed_name'])[0]) il.add_value('dba_name',self._getDBA(meta['mixed_name'])[1]) il.add_value('mixed_subtype',meta["mixed_subtype"]) il.add_value('person_address_string',meta["person_address_string"]) il.add_value('mixed_contractor_name',self._getDBA(meta['mixed_contractor_name'])[0]) il.add_value('contractor_dba',self._getDBA(meta['mixed_contractor_name'])[1]) il.add_value('contractor_address_string',meta["contractor_address_string"]) il.add_value('inspection_subtype',meta["inspection_subtype"]) il.add_value('inspection_date',meta["completed_date"]) il.add_value('inspection_pass_fail',meta["inspection_pass_fail"]) il.add_value('inspection_type',meta["inspection_type"]) il.add_value('permit_type', 'building_permit') return il.load_item()
class WyPhysicianLicensesSpider(CommonSpider):
    """AI-1991: Wyoming Board of Medicine physician-license spider.

    Searches the GL Suite public portal one (last-name term, specialty) pair
    at a time, parses the HTML results table by string-splitting, and joins
    each licensee against a disciplinary-actions PDF parsed with tabula.
    """
    name = '1991_wy_physician_licenses'
    allowed_domains = ['glsuite.us']
    start_urls = [
        'https://wybomprod.glsuite.us/GLSuiteWeb/Clients/WYBOM/Public/Licenseesearch.aspx?SearchType=Physician'
    ]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1991_Licenses_Physician_WY_CurationReady'),
        'JIRA_ID': 'AI_1991',
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        'RANDOM_PROXY_DISABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('wy_physician_licenses'),
        # Maps exported column -> human-readable header in the curated CSV.
        'TOP_HEADER': {
            'board certification': 'Board Certification',
            'company_name': 'Licensee Name',
            'company_phone': 'Phone',
            'dba_name': '',
            'disciplinary actions': 'Disciplinary Actions',
            'location_address_string': 'Office Address',
            'mail_address_string': '',
            'permit_lic_desc': '',
            'permit_lic_eff_date': 'Date Licensed',
            'permit_lic_exp_date': 'Expiration Date',
            'permit_lic_no': 'License Number',
            'permit_lic_status': 'License Status',
            'permit_subtype': 'Specialty',
            'permit_type': '',
            'reactivation date': 'Reactivation Date',
            'sub-specialty': 'Sub-Specialty',
            'violation_description': 'Disciplinary Summary',
            'violation_type': ''
        },
        'FIELDS_TO_EXPORT': [
            'company_name', 'dba_name', 'location_address_string',
            'mail_address_string', 'company_phone', 'permit_lic_no',
            'permit_lic_eff_date', 'reactivation date',
            'permit_lic_exp_date', 'permit_lic_status',
            'board certification', 'permit_subtype', 'sub-specialty',
            'disciplinary actions', 'violation_description',
            'violation_type', 'permit_lic_desc', 'permit_type',
            'sourceName', 'url', 'ingestion_timestamp',
        ],
        'NULL_HEADERS': [
            'sub-specialty', 'board certification', 'disciplinary actions',
            'reactivation date'
        ]
    }
    # Mutable class-level crawl state shared across callbacks.
    search_element = []   # unused here; kept for base-class compatibility
    option = []
    check_first = True    # True -> (re)initialise number/yr on next parse()
    year = ''             # list of last-name search terms (from strRange)
    number = []           # remaining specialty dropdown values to search
    yr = ''               # current last-name search term
    years = []

    def __init__(self, start=None, end=None, startnum=None, endnum=None,
                 proxyserver=None, *a, **kw):
        super(WyPhysicianLicensesSpider, self).__init__(start, end,
                                                        proxyserver=None,
                                                        *a, **kw)
        # Despite the name, `year` holds last-name search strings built
        # from the start/end range arguments.
        self.year = SearchCriteria.strRange(self.start, self.end)

    def parse(self, response):
        """Submit one search (current last-name term x next specialty)."""
        if self.check_first:
            self.check_first = False
            # self.number = SearchCriteria.numberRange(self.startmm,self.endmm,1)
            # Specialty dropdown values; [0] is the blank "all" entry.
            self.number = response.xpath(
                '//*[@id="bodyContent_ddlSpecialty"]/option/@value').extract(
                )[1:]
            self.yr = self.year.pop(0)
        if len(self.number) > 0:
            app = self.number.pop(0)
            form_data = {
                '__VIEWSTATE': response.xpath(
                    '//*[@id="__VIEWSTATE"]/@value').extract_first(),
                '__VIEWSTATEGENERATOR': response.xpath(
                    '//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
                '__EVENTVALIDATION': response.xpath(
                    '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                'ctl00$bodyContent$txtLicNum': '',
                'ctl00$bodyContent$txtLastName': str(self.yr),
                # 'ctl00$bodyContent$txtLastName':'Allen',
                'ctl00$bodyContent$txtFirstName': '',
                'ctl00$bodyContent$txtCity': '',
                'ctl00$bodyContent$ddlState': '',
                'ctl00$bodyContent$ddlSpecialty': str(app),
                # 'ctl00$bodyContent$ddlSpecialty':'',
                'ctl00$bodyContent$ddlBoardCert': '',
                'ctl00$bodyContent$btnSubmit': 'Perform Search',
            }
            print("________________________________",
                  form_data['ctl00$bodyContent$ddlSpecialty'])
            print("________________________________",
                  form_data['ctl00$bodyContent$txtLastName'])
            yield scrapy.FormRequest.from_response(response,
                                                   formdata=form_data,
                                                   callback=self.parse_two,
                                                   method='POST',
                                                   dont_filter=True)

    def parse_two(self, response):
        """Parse the results table and yield one item per licensee.

        The table HTML is split on the grey row-separator style string, so
        each fragment in `name1` is one licensee's block of label/value rows.
        """
        company_name = permit_lic_no = company_phone = location_address_string = permit_lic_exp_date = permit_lic_eff_date = permit_lic_status = reactivation_date = board_certificate = specilaity = sub_specialty = disciplinary = permit_subtype = ''
        name = response.xpath(
            "//table[@id='bodyContent_tblResults']").extract_first()
        name1 = name.split(
            'background-color:LightGrey;border-color:DarkGray;border-width:4px;border-style:Solid;'
        ) if name else ''
        for val in name1:
            value = val.split('</tr>')
            if 'href' in val:
                href = val.split('href=')[1].split(">")[0].replace('"', '')
            # Dispatch each <tr> fragment on its label text.  Order matters:
            # 'Sub-Specialty' must be tested before 'Specialty'.
            for value1 in value:
                if value1 and 'Licensee Name' in value1:
                    company_name = self.data_clean(value1)
                    print('______________________________________-',
                          company_name)
                elif value1 and 'License Number' in value1:
                    permit_lic_no = self.data_clean(value1)
                elif value1 and 'Office Address and Phone' in value1:
                    location = self.data_clean(value1)
                    # Phone pattern like "(307) 555-1234"; address is the
                    # text preceding the phone number.
                    match = re.search(r'\(?[\d]+\)?[\s][\d]+[-][\d]+',
                                      location)
                    if match:
                        company_phone = match.group()
                        location_address_string = location[:location.index(
                            company_phone)]
                    else:
                        location_address_string = location
                        # location_address_string=location[:location.index(company_phone)]
                        company_phone = ''
                elif value1 and 'Date Licensed' in value1:
                    permit_lic_eff_date = self.data_clean(value1)
                    print(
                        "----------------------------------------------------",
                        permit_lic_eff_date)
                elif value1 and 'License Status' in value1:
                    permit_lic_exp = self.data_clean(value1)
                    if permit_lic_exp:
                        # NOTE(review): assumes the status cell always holds
                        # a ';'-separated status plus a date — a row without
                        # either would raise here; confirm against live data.
                        date_match = re.search(
                            r'[\d]{1,3}[/][\d]{1,3}[/][\d]{1,4}',
                            permit_lic_exp)
                        permit_lic_status = re.split(
                            r';',
                            permit_lic_exp)[1].replace('Board Certified', '')
                        permit_lic_exp_date = date_match.group()
                    else:
                        permit_lic_exp_date = ''
                elif value1 and 'Reactivation Date' in value1:
                    reactivation_date = self.data_clean(value1)
                elif value1 and 'Board Certification' in value1:
                    board_certificate = self.data_clean(value1).replace(
                        'Board Certified:', '')
                elif value1 and 'Sub-Specialty' in value1:
                    sub_specialty = self.data_clean(value1)
                elif value1 and 'Specialty' in value1:
                    permit_subtype = self.data_clean(value1)
                elif value1 and 'Disciplinary Actions' in value1:
                    disciplinary = self.data_clean(value1)
            il = ItemLoader(item=WyPhysicianLicensesSpiderItem(),
                            response=response)
            # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
            il.add_value('ingestion_timestamp',
                         Utils.getingestion_timestamp())
            il.add_value('sourceName', 'WY_Physician_Licenses')
            il.add_value(
                'url',
                'https://wybomprod.glsuite.us/GLSuiteWeb/Clients/WYBOM/Public/Licenseesearch.aspx?SearchType=Physician'
            )
            il.add_value('permit_type', 'medical_license')
            il.add_value('board certification', board_certificate)
            il.add_value('dba_name', '')
            il.add_value('permit_lic_status', permit_lic_status)
            il.add_value('permit_lic_no', permit_lic_no)
            il.add_value('permit_lic_eff_date', permit_lic_eff_date)
            il.add_value('permit_subtype', permit_subtype)
            il.add_value('violation_type', 'health_violation')
            il.add_value('sub-specialty', sub_specialty)
            il.add_value('reactivation date', reactivation_date)
            il.add_value(
                'location_address_string',
                location_address_string if location_address_string else 'WY')
            il.add_value('permit_lic_desc',
                         'Physician License for ' + str(company_name))
            # Look this licensee up in the disciplinary-actions PDF.
            if self.df(company_name):
                il.add_value(
                    'violation_description',
                    self.df(company_name).replace(
                        'Disciplinary Summary', ''))
                il.add_value('disciplinary actions', 'Yes')
                il.add_value('mail_address_string', location_address_string)
            else:
                il.add_value('violation_description', '')
                il.add_value('disciplinary actions', '')
                il.add_value('mail_address_string', '')
            il.add_value('company_phone', company_phone)
            il.add_value('permit_lic_exp_date', permit_lic_exp_date)
            il.add_value('company_name', company_name)
            yield il.load_item()
        # Continue with the next specialty for this term, or the next term.
        if len(self.number) > 0:
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse,
                                 dont_filter=True)
        elif len(self.year) > 0:
            self.check_first = True
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse,
                                 dont_filter=True)

    def df(self, vaal):
        """Return the disciplinary summary for licensee *vaal*, else ''/None.

        Downloads and parses the board's disciplinary-report PDF with tabula
        on every call, rebuilding {licensee name: summary} each time.
        NOTE(review): this re-fetches the PDF per licensee — consider caching;
        confirm before relying on its performance.
        """
        import tabula
        import re
        df = tabula.read_pdf(
            'https://da26d420-a-84cef9ff-s-sites.googlegroups.com/a/wyo.gov/wyomedboard/files-1/Discipline%20Report%209-18-2019.pdf?attachauth=ANoY7coROsLSnrNTPX6i5XMCJgj5my472u6SWrlUT0NPQzlvu15U_nr_XdVgDBv7Va7l4Xw1KQdt97dmsXhzbUeEEsYASJskIW477VWbQ_GNGI257-6PeH3X0DoUKBtQhtmsJY2dOlMqZI54S0KI0cZoi8_9L4jj_wS-lJTn2dGna7nA9o0PliCJ-DTgT5s0vWlB0f16qwzVIxHabDYEQWTA5wUsZQtmgz9rL9DXc43Eh4PsFaqdqwM%3D&attredirects=0',
            area=[2.678, 59.67, 783.743, 589.815],
            columns=[175],
            pages='all',
            guess=False,
            encoding='ISO-8859-1',
            pandas_options={
                'header': None
            }).fillna('')
        df.columns = ['a', 'b']
        # Drop everything above the first 'Licensee Name' header row.
        start_index = df[df['a'] == 'Licensee Name'].index.tolist()
        index_list = [i for i in range(start_index[0])]
        df.drop(df.index[index_list], inplace=True)
        li2 = []
        for _, row in df.fillna('').iterrows():
            li2.extend(row)
        results = map(str, li2)
        v = re.compile(r'^Licensee Name.*$')
        match = list(filter(v.match, results))
        if match:
            # Collapse each row's cells into one string, then number the
            # rows so each licensee's lines group together.
            df[1] = df.apply(lambda x: ' '.join(x.dropna().astype(str)),
                             axis=1)

            def fillUniqueNums(v):
                # Stateful counter: bump the group id on the row after each
                # 'Licensee Name' header (function attributes hold state).
                if fillUniqueNums.change:
                    fillUniqueNums.unique_num += 1
                    fillUniqueNums.change = False
                if 'Licensee Name' in v[1]:
                    fillUniqueNums.change = True
                return str(fillUniqueNums.unique_num)

            fillUniqueNums.change = False
            fillUniqueNums.unique_num = 1
            df[0] = df.apply(lambda v: fillUniqueNums(v), axis=1)
            df[0] = df[0].shift(-1)
            df = df[[0, 1]]
            df = df.groupby(0)[1].apply(list)
        dict1 = {}
        dict2 = {}
        for data in df:
            overall = ' '.join(data)
            # Name is everything up to and including 'Mailing Address'.
            na = re.split("Mailing Address", overall)[0] + re.search(
                "Mailing Address", overall).group()
            name = na.replace('Mailing Address',
                              '').replace('Licensee Name',
                                          '').strip().replace('.', ',')
            # Summary text starts after the 5-digit ZIP; fall back to the
            # text after 'Mailing Address' (case-insensitive) otherwise.
            date_match = re.search(r'[\d]{5}[\s]+[a-zA-Z].*', overall)
            if date_match:
                ab = date_match.group()
                zip_code = re.search(r'^[\d]{1,5}', ab)
                desc = re.split(zip_code.group(), ab)[1]
            else:
                date_match = re.search(
                    r'[mM][aA][iI][lL][iI][nN][gG][\s]*[aA][dD][dD][rR][eE][sS][sS].*',
                    overall)
                if date_match:
                    ab = date_match.group().replace('Mailing Address', '')
                    desc = ab
            # NOTE(review): if neither pattern matches, `desc` keeps its
            # previous value — looks like an entry could inherit the prior
            # licensee's summary; confirm intended.
            dict1[name] = desc
            dict2.update(dict1)
        if vaal:
            description = dict2.get(vaal)
        else:
            description = ''
        return description

    def data_clean(self, value):
        """Strip HTML tags, collapse whitespace and drop the field labels
        from a results-table fragment; return '' on any failure."""
        if value:
            try:
                clean_tags = re.compile('<.*?>')
                desc_list = re.sub('\s+', ' ', re.sub(clean_tags, '', value))
                desc_list_rep = desc_list.replace('Licensee Name', '').replace(
                    'License Number', '').replace('Office Address and Phone', '').replace(
                        'Date Licensed', '').replace('Reactivation Date', '').replace(
                            'License Status', '').replace('Board Certification',
                                                          '').replace(
                    'Specialty', '').replace('Sub-Specialty', '').replace(
                        'Disciplinary Actions', '').replace('Disciplinary Summary',
                                                            '').replace('Sub-', '')
                return desc_list_rep.strip()
            except:
                return ''
        else:
            return ''
class IlKankakeeFoodInspectionsSpider(CommonSpider): name = '1381_il_kankakee_food_inspections' allowed_domains = ['kankakeehealth.org'] start_urls = [ 'http://www.kankakeehealth.org/environmental-health/food-sanitation/food_inspections.html' ] custom_settings = { 'FILE_NAME': Utils.getRundateFileName( 'AI-1381_Inspections_Food_IL_Kankakee_CurationReady'), 'JIRA_ID': 'AI_1381', 'COOKIES_ENABLED': True, 'COOKIES_DEBUG': True, 'HTTPCACHE_ENABLED': False, 'DOWNLOAD_DELAY': 2, 'CONCURRENT REQUESTS': 1, # 'JOBDIR' : CustomSettings.getJobDirectory('IlKankakeeFoodInspectionsSpider'), 'TOP_HEADER': { 'abate_date': '', 'abate_status': 'Compliance Status.1', 'company_name': 'Establishment', 'dba_name': 'Permit Holder', 'inspection_date': '', 'inspection_pass_fail': 'Inspection Result', 'inspection_subtype': 'Purpose of Inspection', 'inspection_type': '', 'inspector_comments': 'Remarks', 'location_address_string': 'Street Address', 'permit_lic_desc': '', 'permit_lic_no': 'License/Permit #', 'permit_type': '', 'risk category': 'Risk Category', 'temperature observations-item/location': 'TEMPERATURE OBSERVATIONS-Item/Location', 'violation category': 'Violation Category', 'violation_date': '', 'violation_description': 'OBSERVATIONS AND CORRECTIVE ACTIONS or Deficiencies/Remarks/Corrections', 'violation_rule': 'Compliance Status', 'violation_rule_id': 'Item Number', 'violation_subtype': '', 'violation_type': '' }, 'FIELDS_TO_EXPORT': [ 'company_name', 'permit_lic_no', 'location_address_string', 'dba_name', 'risk category', 'inspection_date', 'inspection_subtype', 'inspection_pass_fail', 'inspector_comments', 'inspection_type', 'violation_date', 'violation_rule_id', 'violation_rule', 'abate_date', 'abate_status', 'violation category', 'violation_subtype', 'temperature observations-item/location', 'violation_description', 'violation_type', 'permit_lic_desc', 'permit_type', 'sourceName', 'url', 'ingestion_timestamp', ], 'NULL_HEADERS': [ 'temperature observations-item/location', 
'violation category', 'risk category' ] } def __init__(self, start=None, end=None, startnum=None, endnum=None, proxyserver=None, *a, **kw): super(IlKankakeeFoodInspectionsSpider, self).__init__(start, end, proxyserver=None, *a, **kw) import csv import os current_file_path = os.path.abspath(os.path.dirname( __file__)) + '/AI_1381_permit_no_list_{}_{}.csv'.format( self.start, self.end) self.csv = open(current_file_path, "w") columnTitleRow = "permit_lic_no\n" self.csv.write(columnTitleRow) def parse(self, response): yield scrapy.Request(url='http://il.healthinspections.us/kankakee/', dont_filter=True, callback=self.parse_link) def parse_link(self, response): link = response.xpath( '//div/table//tr/td[3]/table[1]//tr[2]/td/div[4]/a/@href' ).extract_first() url_join = 'http://il.healthinspections.us/kankakee/' + str(link) yield scrapy.Request(url=url_join, dont_filter=True, callback=self.parse_get) slink = [] def parse_get(self, response): meta = {} if len(self.slink) == 0: self.slink = response.xpath( '//*[@id="innersearchbox"]/table//tr/td/a/@href').extract() if len(self.slink) > 0: search_link = self.slink.pop(0) print('========================================>', search_link) link_join = 'http://il.healthinspections.us/kankakee/' + str( search_link) yield scrapy.Request(url=link_join, dont_filter=True, callback=self.parse_get1, meta={'page': '1'}) @inline_requests def parse_get1(self, response): meta = response.meta val_link = response.xpath('//div/table//tr[1]/td[1]/a/@href').extract() for val in val_link: if 'estab.cfm' in val: val_join = 'http://il.healthinspections.us/kankakee/' + str( val) # val_join='' parse_get2 = yield scrapy.Request(url=val_join, dont_filter=True) table = parse_get2.xpath('//div/table//tr[1]/td[1]/div')[1:] for i in table: company_name = i.xpath('div[1]/b/text()').extract_first() meta['company_name'] = self._getDBA(company_name)[0] csv_row = str(company_name) + "\n" meta['permit_lic_desc'] = 'Restaurant License for ' + str( company_name) 
address = i.xpath('div[2]/text()').extract() clean_tags = re.compile('<.*?>') meta['location_address_string'] = ','.join( re.sub('\s+', ' ', re.sub(clean_tags, ' ', desc)) for desc in address) date = i.xpath('div[3]/text()').extract() meta['inspection_date'] = ''.join( re.sub('\s+', ' ', re.sub(clean_tags, ' ', desc)) for desc in date) ins_link = i.xpath('div/a/@href').extract() for ins in ins_link: if ins: ins_join = 'http://il.healthinspections.us/' + str( ins.replace('../', '/')) parse_get3 = yield scrapy.Request(url=ins_join, dont_filter=True) method2 = parse_get3.xpath( "//*[contains(text(),'Establishment #')]/u/text()" ).extract_first() met = parse_get3.xpath( "//*[contains(text(),'License/Permit #')]/ancestor::td/text()" ).extract() method1 = ''.join( str(self.data_clean(desc)) for desc in met) meta['dba_name'] = '' check = 0 if method1: check = 1 meta['permit_lic_no'] = method1 dba = parse_get3.xpath( "//*[contains(text(),'Permit Holder')]/ancestor::td/text()" ).extract() ris = parse_get3.xpath( "//*[contains(text(),'Risk Category')]/ancestor::td/text()" ).extract() meta['risk'] = ''.join( str(self.data_clean(desc)) for desc in ris) inspection_sub = parse_get3.xpath( "//*[contains(text(),'Purpose of Inspection')]/ancestor::td/text()" ).extract() meta['inspection_subtype'] = ''.join( str(self.data_clean(desc)) for desc in inspection_sub) meta['inspection_pass_fail'] = '' correcive_action = parse_get3.xpath( '//*[contains(text(), "OBSERVATIONS AND CORRECTIVE ACTIONS")]/following::table' )[1:] check_val = value2 = check_val1 = des = '' dict1 = {} dict2 = {} for cor in correcive_action: descrip = [] vio_comment = '' value = self.data_clean( cor.xpath( 'tr/td[1]/text()').extract_first()) if value and len( value ) > 0 and 'Inspection' not in value and ':' not in value: des = cor.xpath('tr/td[2]').extract() vio_comment1 = ''.join( str(self.data_clean(desc)) for desc in des) vio_comment = vio_comment1 if value: check_val = value if check_val in dict2.keys(): 
old_desc_list = dict2[ check_val] old_desc_list.append( vio_comment) descrip = old_desc_list dict2[check_val] = descrip else: descrip.append(vio_comment) dict2[check_val] = descrip else: pass inspector_comments = parse_get3.xpath( '//*[contains(text(), "Inspection Comments")]/following::td[1]/text()' ).extract() meta['inspector_comments'] = ''.join( str(self.data_clean(desc)) for desc in inspector_comments) meta['inspection_type'] = 'health_inspection' table_path = parse_get3.xpath( "//*[contains(text(), 'TEMPERATURE OBSERVATIONS')]/following::table//tr/td/text()" ).extract() tem = ' '.join( str(self.data_clean(desc)) for desc in table_path) if tem and 'Item Number' in tem: meta['temperature'] = tem.split( 'Item Number')[0] else: meta['temperature'] = '' food = parse_get3.xpath( '//div[1]/div/table[4]//tr/td/table//tr') good = parse_get3.xpath( '//div[1]/div/table[7]//tr/td/table//tr') rule_list = [] check_yield = 0 if food: for food1 in food: meta['dba_name'] = ''.join( str(self.data_clean(desc)) for desc in dba) out = food1.xpath( 'td[3]/span[@style="padding-left:4px;padding-right:4px;border:solid 1px red;-webkit-border-radius: 25px;-moz-border-radius: 25px;border-radius: 25px;"]/text()' ).extract_first() if out and 'OUT' in out: check_yield = 1 meta['rule_id'] = self.data_clean( food1.xpath('td[1]/text()'). 
extract_first()) rule_list.append(meta['rule_id']) meta['rule'] = food1.xpath( 'td[6]/text()').extract_first( ) meta['violation_description'] = '' if meta['rule_id'] in dict2.keys(): vio_Des = dict2[ meta['rule_id']] else: vio_Des = '' for vio_Des1 in vio_Des: meta[ 'violation_description'] = vio_Des1 meta[ 'violation_subtype'] = 'Critical' meta[ 'violation_type'] = 'health_violation' meta[ 'violation_category'] = 'FOODBORNE ILLNESS RISK FACTORS AND PUBLIC HEALTH INTERVENTIONS' meta['violation_date'] = meta[ 'inspection_date'] meta['abate_status'] = meta[ 'abate_date'] = '' if 'Correct By:' in meta[ 'violation_description']: meta[ 'abate_status'] = 'corrected on-site during inspection' meta['abate_date'] = self.format_date( meta[ 'violation_description'] .split('(Correct By:') [1].split(')')[0]) else: meta[ 'abate_status'] = meta[ 'abate_date'] = '' yield self.save_to_csv( response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA( company_name)[1] if dba_namee and len( dba_namee) > 3: meta[ 'dba_name'] = dba_namee yield self.save_to_csv( response, **meta) if good: meta['rule_id'] = meta['rule'] = meta[ 'violation_description'] = meta[ 'violation_type'] = meta[ 'violation_subtype'] = meta[ 'abate_status'] = meta[ 'abate_date'] = '' for good1 in good: meta['dba_name'] = ''.join( str(self.data_clean(desc)) for desc in dba) x = good1.xpath( 'td[2]/text()').extract_first() if x and 'X' in x: check_yield = 1 meta['rule_id'] = self.data_clean( good1.xpath('td[1]/text()'). 
extract_first()) rule_list.append(meta['rule_id']) meta['rule'] = good1.xpath( 'td[3]/text()').extract_first( ) meta['violation_description'] = '' if meta['rule_id'] in dict2.keys(): vio_Des = dict2[ meta['rule_id']] else: vio_Des = '' for vio_Des1 in vio_Des: meta[ 'violation_description'] = vio_Des1 meta[ 'violation_category'] = 'GOOD RETAIL PRACTICES' meta[ 'violation_subtype'] = 'Non Critical' meta[ 'violation_type'] = 'health_violation' meta['violation_date'] = meta[ 'inspection_date'] meta['abate_status'] = meta[ 'abate_date'] = '' if 'Correct By:' in meta[ 'violation_description']: meta[ 'abate_status'] = 'corrected on-site during inspection' meta['abate_date'] = self.format_date( meta[ 'violation_description'] .split('(Correct By:') [1].split(')')[0]) yield self.save_to_csv( response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA( company_name)[1] if dba_namee and len( dba_namee) > 3: meta[ 'dba_name'] = dba_namee yield self.save_to_csv( response, **meta) for rule in dict2.keys(): if rule in rule_list: pass else: meta['dba_name'] = ''.join( str(self.data_clean(desc)) for desc in dba) check_yield = 1 meta['rule_id'] = rule meta['violation_description'] = dict2[ rule] meta['violation_subtype'] = '' meta[ 'violation_type'] = 'health_violation' meta['violation_date'] = meta[ 'inspection_date'] meta['violation_category'] = meta[ 'abate_status'] = meta[ 'abate_date'] = '' meta['rule'] = '' yield self.save_to_csv( response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA( company_name)[1] if dba_namee and len(dba_namee) > 3: meta['dba_name'] = dba_namee yield self.save_to_csv( response, **meta) if check_yield == 0: meta['dba_name'] = ''.join( str(self.data_clean(desc)) for desc in dba) print('==================check_yield') meta['rule_id'] = '' meta['violation_description'] = '' meta['violation_subtype'] = '' meta['violation_type'] = '' meta['violation_date'] = '' meta['violation_category'] = meta[ 'abate_status'] = meta[ 'abate_date'] = '' 
meta['rule'] = '' yield self.save_to_csv(response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA(company_name)[1] if dba_namee and len(dba_namee) > 3: meta['dba_name'] = dba_namee yield self.save_to_csv( response, **meta) if method2: check = 1 meta['permit_lic_no'] = method2 meta['inspection_subtype'] = parse_get3.xpath( "//table[1]//tr[1]/td/table[1]//tr/td[3]/font/strong/b[contains(text(),'X')]/following::text()" ).extract_first() check_val = value2 = check_val1 = des = '' dict1 = {} dict2 = {} correcive_action = parse_get3.xpath( '//*[contains(text(), "Remarks and Recommendations for Corrections")]/ancestor::table//tr' )[2:] for cor in correcive_action: descrip = [] value = self.data_clean( cor.xpath( 'td[1]/text()').extract_first()) if value and len(value) > 0: des = cor.xpath('td[2]').extract() add = cor.xpath( 'td[3]/text()').extract_first() vio_com = ''.join( str(self.data_clean(desc)) for desc in des) vio_comment = str(vio_com) + str(add) else: pass if value: check_val = value if check_val in dict2.keys(): old_desc_list = dict2[check_val] old_desc_list.append(vio_comment) descrip = old_desc_list dict2[check_val] = descrip else: descrip.append(vio_comment) dict2[check_val] = descrip meta['risk'] = meta['inspection_pass_fail'] = meta[ 'temperature'] = meta['violation_date'] = meta[ 'rule'] = meta['rule_id'] = meta[ 'abate_date'] = meta['abate_status'] = meta[ 'violation_category'] = meta[ 'violation_subtype'] = meta[ 'violation_description'] = meta[ 'violation_type'] = '' inspector_comments = parse_get3.xpath( "//*[contains(text(),'General Comments')]/following::tr/td[@style='border:2px solid black; padding:3px']/text()" ).extract() meta['inspector_comments'] = ' '.join( str(self.data_clean(desc)) for desc in inspector_comments) meta['inspection_type'] = 'health_inspection' vio_tab = parse_get3.xpath( '//table[1]//tr[1]/td/table[4]//tr') for vio in vio_tab: rule = vio.xpath( '//*[@class="checkX"]/preceding::td[1]/following-sibling::td[3]/text()' 
).extract() rule_id = vio.xpath( '//*[@class="checkX"]/preceding::td[1]//text()' ).extract() vio_dict = dict(zip(rule_id, rule)) if len(vio_dict) > 0: for n in vio_dict.keys(): meta['dba_name'] = parse_get3.xpath( '//table[1]//tr[1]/td/table[3]//tr[1]/td[1]/font/strong[contains(text(),"Owner or Operator")]/following::td/text()' ).extract_first() if n in dict2.keys(): vio_Des = dict2[n] meta['rule'] = vio_dict[n] meta['rule_id'] = n else: meta['violation_description'] = '' meta['violation_subtype'] = '' meta[ 'violation_type'] = 'health_violation' meta['violation_date'] = meta[ 'inspection_date'] meta['violation_category'] = meta[ 'abate_status'] = meta[ 'abate_date'] = '' meta['rule'] = vio_dict[n] meta['rule_id'] = n yield self.save_to_csv( response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA( company_name)[1] if dba_namee and len( dba_namee) > 3: meta['dba_name'] = dba_namee yield self.save_to_csv( response, **meta) for vio_Des1 in vio_Des: meta['dba_name'] = parse_get3.xpath( '//table[1]//tr[1]/td/table[3]//tr[1]/td[1]/font/strong[contains(text(),"Owner or Operator")]/following::td/text()' ).extract_first() meta[ 'violation_description'] = vio_Des1 meta['violation_subtype'] = '' meta[ 'violation_type'] = 'health_violation' meta['violation_date'] = meta[ 'inspection_date'] meta['violation_category'] = '' meta['abate_status'] = meta[ 'abate_date'] = '' if 'Onsite' in meta[ 'violation_description']: meta[ 'abate_status'] = 'corrected on-site during inspection' meta['abate_date'] = meta[ 'inspection_date'] else: meta['abate_status'] = meta[ 'abate_date'] = '' meta['violation_description'] = meta[ 'violation_description'].replace( 'Immediate/Onsite', '' ).replace( 'Next Inspection', '' ).replace( 'NEXT INSPECTION', '' ) if meta[ 'violation_description'] and len( meta[ 'violation_description'] ) > 2 else '' yield self.save_to_csv( response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA( company_name)[1] if dba_namee and len( dba_namee) > 3: 
meta['dba_name'] = dba_namee yield self.save_to_csv( response, **meta) for m in dict2.keys(): if m in vio_dict.keys(): pass else: meta['dba_name'] = parse_get3.xpath( '//table[1]//tr[1]/td/table[3]//tr[1]/td[1]/font/strong[contains(text(),"Owner or Operator")]/following::td/text()' ).extract_first() meta['rule_id'] = m meta[ 'violation_description'] = dict2[ m] meta['violation_description'] = meta[ 'violation_description'].replace( 'Immediate/Onsite', '' ).replace( 'Next Inspection', '' ).replace( 'NEXT INSPECTION', '' ) if meta[ 'violation_description'] and len( meta[ 'violation_description'] ) > 2 else '' meta['violation_subtype'] = '' meta[ 'violation_type'] = 'health_violation' meta['violation_date'] = meta[ 'inspection_date'] meta['violation_category'] = meta[ 'abate_status'] = meta[ 'abate_date'] = '' meta['rule'] = '' yield self.save_to_csv( response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA( company_name)[1] if dba_namee and len( dba_namee) > 3: meta['dba_name'] = dba_namee yield self.save_to_csv( response, **meta) else: meta['dba_name'] = parse_get3.xpath( '//table[1]//tr[1]/td/table[3]//tr[1]/td[1]/font/strong[contains(text(),"Owner or Operator")]/following::td/text()' ).extract_first() meta['violation_date'] = meta['rule'] = meta[ 'rule_id'] = meta['abate_date'] = meta[ 'abate_status'] = meta[ 'violation_category'] = meta[ 'violation_subtype'] = meta[ 'violation_description'] = meta[ 'violation_type'] = '' yield self.save_to_csv(response, **meta) meta['dba_name'] = '' dba_namee = self._getDBA(company_name)[1] if dba_namee and len(dba_namee) > 3: meta['dba_name'] = dba_namee yield self.save_to_csv( response, **meta) if check == 0: break page_val = response.meta['page'] next_pagee = response.xpath( '//table//tr/td/a[@class="buttN"]/b[contains(text(), "' + (str(page_val)) + '")]/following::a/@href').extract_first() print(next_pagee, 'next_pagee@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@', page_val) if next_pagee and 'start=' in next_pagee: main_url = 
                    'http://il.healthinspections.us/kankakee/' + str(
                        next_pagee)
                # Queue the next result page, tracking the page counter in meta.
                yield scrapy.Request(url=main_url,
                                     callback=self.parse_get1,
                                     dont_filter=True,
                                     meta={'page': int(response.meta['page']) + 1})
        # Once pagination is exhausted, restart from the search page while
        # there are still unprocessed search links queued in self.slink.
        if len(self.slink) > 0:
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse_get,
                                 dont_filter=True)

    def data_clean(self, value):
        """Normalise one scraped HTML fragment to plain text.

        Strips HTML tags, collapses whitespace runs to single spaces and
        trims the result. Returns '' for None/empty input or on any error.
        """
        if value:
            try:
                clean_tags = re.compile('<.*?>')
                desc_list = re.sub('\s+', ' ', re.sub(clean_tags, ' ', value))
                # NOTE(review): this replace is a no-op as written; it was
                # probably meant to decode '&amp;' back to '&' -- confirm.
                desc_list_rep = desc_list.replace('&', '&')
                return desc_list_rep.strip()
            except:
                # Best-effort cleaner: any failure yields an empty string.
                return ''
        else:
            return ''

    def save_to_csv(self, response, **meta):
        """Build one IL_Kankakee_Food_Inspections item from the collected
        meta fields and return the loaded item for the caller to yield."""
        il = ItemLoader(item=IlKankakeeFoodInspectionsSpiderItem(),
                        response=response)
        # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'IL_Kankakee_Food_Inspections')
        il.add_value(
            'url',
            'http://www.kankakeehealth.org/environmental-health/food-sanitation/food_inspections.html'
        )
        il.add_value('violation_date', meta['violation_date'])
        il.add_value('permit_lic_no', meta['permit_lic_no'])
        il.add_value('location_address_string',
                     meta['location_address_string'])
        il.add_value('inspector_comments', meta['inspector_comments'])
        il.add_value('inspection_date', meta['inspection_date'])
        il.add_value('company_name', meta['company_name'])
        il.add_value('violation_rule_id', meta['rule_id'])
        il.add_value('violation_subtype', meta['violation_subtype'])
        il.add_value('inspection_pass_fail', meta['inspection_pass_fail'])
        il.add_value('violation category', meta['violation_category'])
        il.add_value('dba_name', meta['dba_name'])
        il.add_value('inspection_type', meta['inspection_type'])
        il.add_value('violation_description', meta['violation_description'])
        il.add_value('risk category', meta['risk'])
        il.add_value('abate_date', meta['abate_date'])
        il.add_value('abate_status', meta['abate_status'])
        il.add_value('temperature observations-item/location',
                     meta['temperature'])
        il.add_value('inspection_subtype', meta['inspection_subtype'])
        il.add_value('violation_rule', meta['rule'])
        il.add_value('permit_lic_desc', meta['permit_lic_desc'])
        il.add_value('permit_type', 'restaurant_license')
        il.add_value('violation_type', meta['violation_type'])
        return il.load_item()
class CtForestPractitionerLicenseSpider(CommonSpider):
    """Scrape the CT DEEP forest-practitioner certification directory PDF.

    The directory is a single PDF; tabula-py extracts a fixed column grid,
    pandas re-joins rows that wrap across PDF lines, and each resulting
    record is emitted as one license item.
    """
    name = 'ct_forest_practitioner_license'
    allowed_domains = ['ct.gov']
    start_urls = [
        'https://www.depdata.ct.gov/forestry/ForestPractitioner/directry.pdf'
    ]
    custom_settings = {
        'FILE_NAME':
        Utils.getRundateFileName(
            'Licenses_ForestPractitioner_CT_CurationReady'),
        'JIRA_ID': 'AI_710',
        # 'JOBDIR' : CustomSettings.getJobDirectory('CtForestPractitionerLicenseSpider'),
        'TOP_HEADER': {
            'extended permit': 'Extended Permit',
            'level': 'Level',
            'location_address_string': 'Address',
            'permit_lic_desc': '',
            'permit_lic_exp_date': 'Expiration',
            'permit_lic_no': 'Cert. #',
            'permit_subtype': '',
            'permit_type': '',
            'person_name': 'Name',
            'person_phone': 'Phone'
        },
        'FIELDS_TO_EXPORT': [
            'person_name', 'location_address_string', 'person_phone', 'level',
            'permit_subtype', 'permit_lic_no', 'permit_lic_exp_date',
            'extended permit', 'permit_lic_desc', 'permit_type', 'sourceName',
            'url', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': ['level', 'extended permit']
    }

    def parse(self, response):
        # The start URL *is* the PDF; re-request it so parse_pdf receives it.
        yield scrapy.Request(
            url=
            'https://www.depdata.ct.gov/forestry/ForestPractitioner/directry.pdf',
            callback=self.parse_pdf,
            dont_filter=True)

    def __extractData(self, response):
        # Stateful grouping key: each row with a non-null 'expiration' starts
        # a new group id; following null rows keep the same id, so wrapped
        # continuation lines get merged back into their parent record.
        def rolling_group(val):
            if pd.notnull(val):
                # if pd.notnull(val) and '/' in val and not 'st' in val:
                rolling_group.group += 1
            return rolling_group.group

        rolling_group.group = 0

        # Join a column's non-null cells with '/', then undo joins that would
        # split tokens containing '&' or '-' and collapse doubled joiners.
        def joinFunc(g, column):
            col = g[column]
            joiner = "/"
            s = joiner.join([str(each) for each in col if pd.notnull(each)])
            s = re.sub("(?<=&)" + joiner, " ", s)
            s = re.sub("(?<=-)" + joiner, " ", s)
            s = re.sub(joiner * 2, joiner, s)
            return s

        # Extract pages 8-27 with explicit column x-coordinates.
        # NOTE(review): tabula-py's keyword is lowercase 'stream';
        # 'Stream=True' is likely ignored by build_options -- confirm against
        # the tabula-py version in use.
        def getDf(temp_file, area):
            return tabula.read_pdf(temp_file,
                                   pages='8-27',
                                   Stream=True,
                                   silent=True,
                                   guess=False,
                                   columns=[
                                       95.625, 173.655, 241.74, 315.18,
                                       341.955, 372.555, 437.58, 467.415,
                                       508.725, 597.465
                                   ],
                                   encoding='ISO-8859-1',
                                   area=area,
                                   pandas_options={
                                       'header': 'infer'
                                   }).replace('\r', ' ',
                                              regex=True).dropna(how='all')

        df = getDf(
            'https://www.depdata.ct.gov/forestry/ForestPractitioner/directry.pdf',
            [65.611, 19.89, 731.161, 598.23])
        df.columns = [
            'l_name', 'f_name', 'address', 'city', 'state', 'zip', 'phone',
            'level', 'cert', 'expiration'
        ]
        groups = df.groupby(df['expiration'].apply(rolling_group),
                            as_index=False)
        groupFunct = lambda g: pd.Series(
            [joinFunc(g, col) for col in g.columns], index=g.columns)
        final_df = groups.apply(groupFunct).fillna('')
        # Yields the whole record list as a single value; parse_pdf iterates
        # it with a nested loop.
        yield final_df.to_dict('records')

    def parse_pdf(self, response):
        # Outer loop sees the single record-list yielded above; inner loop
        # walks the per-licensee dicts.
        for row in self.__extractData(response):
            for col in row:
                # d = re.search(r"[\d]/[\d]/[\d]$", col['expiration'])
                # if d:
                # self.state['items_count'] = self.state.get('items_count', 0) + 1
                il = ItemLoader(item=CtForestPractitionerLicenseSpiderItem())
                il.default_input_processor = MapCompose(
                    lambda v: v.strip(), remove_tags, replace_escape_chars)
                il.add_value('ingestion_timestamp',
                             Utils.getingestion_timestamp())
                il.add_value(
                    'url',
                    'https://www.depdata.ct.gov/forestry/ForestPractitioner/directry.pdf'
                )
                il.add_value('sourceName', 'CT_Forest_Practitioner_License')
                il.add_value('person_phone', col['phone'])
                name = col['f_name'] + ' ' + col['l_name']
                il.add_value('person_name', name)
                # An 'expiration' cell may carry a trailing extended-permit
                # marker after a space, e.g. "<date> 490".
                if ' ' in col['expiration']:
                    date = col['expiration'].split(' ')[0]
                    e_permit = col['expiration'].split(' ')[1]
                else:
                    date = col['expiration']
                    e_permit = ''
                print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@2", date)
                il.add_value('permit_lic_exp_date', date)
                if '490' in e_permit:
                    e_permit = "490- permitted to assist landowners seeking classification of their land as 'Forest Land'"
                il.add_value('extended permit', e_permit)
                il.add_value('permit_lic_no', col['cert'])
                # Expand level codes to full descriptions.
                level_desc = col['level']
                if level_desc == 'F':
                    level_desc = 'FORESTER'
                elif level_desc == 'SFPH':
                    level_desc = 'SUPERVISING FOREST PRODUCTS HARVESTER'
                elif level_desc == 'FPH':
                    level_desc = 'FOREST PRODUCTS HARVESTER'
                # NOTE(review): 'level' gets the raw code while subtype/desc
                # get the expanded text -- looks intentional but confirm.
                il.add_value('level', col['level'])
                il.add_value('permit_subtype', level_desc)
                il.add_value('permit_lic_desc', level_desc)
                il.add_value('permit_type', 'forester_license')
                location_address_string = col['address'] + ', ' + col[
                    'city'] + ', ' + col['state'] + ' ' + col['zip']
                il.add_value('location_address_string',
                             location_address_string)
                yield il.load_item()
class FlClayBuildingPermitsSpider(CommonSpider):
    """Scrape building permits from the Clay County, FL permit-search API.

    Flow: parse (count pages per owner-name search term) -> parse_user_det
    (queue each result page) -> parse_detail (per-permit related record) ->
    parse_permit (permit fields) -> parse_permit_notes (notes) ->
    parse_detail_2 (qPublic parcel page: valuation/address/building facts)
    -> parse_inspection_data (inspections; emits the CSV rows).
    """
    name = '116_fl_clay_building_permits'
    allowed_domains = ['claycountygov.com']
    start_urls = ['https://public.claycountygov.com/PermitSearch/']
    # start_urls = ['https://public.claycountygov.com/PermitSearch/#tab=Owner&sortfield=issuedate&sortdirection=D&owner=aa&status=all&page=1&v=285']
    # mian_url = ['https://public.claycountygov.com/PermitSearch/']
    # page_count = 0
    main_url = 'https://public.claycountygov.com/PermitSearch/'
    single_page_count = 0
    custom_settings = {
        'FILE_NAME':
        Utils.getRundateFileName('116_Permits_Buildings_FL_Clay_CurationReady'),
        'JIRA_ID': 'AI_116',
        'DOWNLOAD_DELAY': 0.5,
        'PROXY_DISABLED': False,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('fl_clay_building_permits'),
        'TOP_HEADER': {
            'contractor_lic_no': 'Contractor#',
            'contractor_lic_type': '',
            'inspection_date': '',
            'inspection_id': '',
            'inspection_pass_fail': '',
            'inspection_subtype': '',
            'inspection_type': '',
            'inspector_comments': '',
            'location_address_string': 'Proj Addr',
            'mixed_contractor_name': 'Contractor Name',
            'mixed_name': 'Owner',
            'mixed_subtype': '',
            'notes': 'Notes',
            'number_of_stories': 'Stories',
            'permit_lic_desc': '',
            'permit_lic_eff_date': 'Issue Dt',
            'permit_lic_no': 'Permit #',
            'permit_lic_value': 'Valuation',
            'permit_subtype': 'PERMIT TYPE',
            'permit_type': '',
            'person_address_string': 'Address',
            'year_built': 'Actual Year Built'
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'permit_subtype', 'permit_lic_desc',
            'location_address_string', 'permit_lic_eff_date', 'notes',
            'mixed_name', 'mixed_subtype', 'person_address_string',
            'mixed_contractor_name', 'contractor_lic_no',
            'contractor_lic_type', 'permit_lic_value', 'number_of_stories',
            'year_built', 'inspection_id', 'inspection_date',
            'inspection_subtype', 'inspection_pass_fail',
            'inspector_comments', 'inspection_type', 'permit_type', 'url',
            'sourceName', 'ingestion_timestamp',
        ],
        'NULL_HEADERS': ['notes']
    }
    # One-shot flag: build the search-term list on the first parse() call.
    value = True

    def parse(self, response):
        """Pop the next owner-name search term and ask the API how many
        result rows it has (parse_user_det converts that to page count)."""
        if self.value:
            # self.start / self.end come from spider arguments (base class).
            self.search_element = SearchCriteria.strRange(self.start, self.end)
            self.value = False
        if len(self.search_element) > 0:
            parm = self.search_element.pop(0)
            page_count_link = 'https://public.claycountygov.com/permitsearch/API/Search/Count?tab=Owner&sortfield=issuedate&sortdirection=D&owner=' + str(
                parm) + '&status=all&page=1&v=496'
            page_count_header = {
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'en-US,en;q=0.9,ta;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Content-Type': 'application/json',
                # DNT: 1
                'Host': 'public.claycountygov.com',
                'Referer': 'https://public.claycountygov.com/PermitSearch/',
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
            yield scrapy.Request(url=page_count_link,
                                 callback=self.parse_user_det,
                                 headers=page_count_header,
                                 dont_filter=True,
                                 meta={"parm": parm})

    def parse_user_det(self, response):
        """Queue every result page for the current search term, then loop
        back to parse() so the next term is processed."""
        # self.search_element = SearchCriteria.strRange(self.start,self.end)
        parm = response.meta['parm']
        data_count = json.loads(response.body_as_unicode())
        page_count = self.ret_count(data_count)
        for x in range(1, page_count + 1):
            start_url_2 = 'https://public.claycountygov.com/permitsearch/API/Search/Permit?tab=Owner&sortfield=issuedate&sortdirection=D&owner=' + str(
                parm) + '&status=all&page=' + str(x) + '&v=376'
            head = {
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'en-US,en;q=0.9,ta;q=0.8',
                # 'Cache-Control': 'max-age=0',
                # 'Connection': 'keep-alive',
                'Content-Type': 'application/json',
                # 'DNT': '1',
                'Host': 'public.claycountygov.com',
                'Referer': 'https://public.claycountygov.com/PermitSearch/',
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
            yield scrapy.Request(url=start_url_2,
                                 callback=self.parse_detail,
                                 dont_filter=True,
                                 headers=head,
                                 meta=response.meta)
        # Re-enter parse() to pop the next search term.
        yield scrapy.Request(url=self.main_url,
                             callback=self.parse,
                             dont_filter=True)

    # @inline_requests
    def parse_detail(self, response):
        """For each permit row on a result page, request its related-permit
        record."""
        jsonres = json.loads(response.body_as_unicode())
        data = response.meta
        if jsonres:
            for x in jsonres:
                data['permit_number'] = x['permit_number']
                permit_num_link = 'https://public.claycountygov.com/permitsearch/API/Permit/Related?permitnumber=' + str(
                    x["permit_number"])
                head = {
                    'Accept': '*/*',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.9,ta;q=0.8',
                    'Content-Type': 'application/json',
                    'Host': 'public.claycountygov.com',
                    'Referer': 'https://public.claycountygov.com/PermitSearch/',
                    'User-Agent':
                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
                }
                yield scrapy.Request(url=permit_num_link,
                                     callback=self.parse_permit,
                                     headers=head,
                                     dont_filter=True,
                                     meta=data)

    def parse_permit(self, response):
        """Collect the permit's core fields, then request its notes."""
        item = response.meta
        jsonres_permit = json.loads(response.body_as_unicode())
        permit_notes_link = 'https://public.claycountygov.com/permitsearch/API/Permit/PermitNotes?permitnumber=' + str(
            item['permit_number'])
        for x in jsonres_permit:
            item['permit_lic_no'] = response.meta['permit_number']
            item['permit_subtype'] = x['permit_type']
            item['permit_lic_desc'] = x['permit_type']
            item['location_address_string'] = ''
            # '01/01/1' is the API's placeholder for "no issue date".
            if self.format_date(x["issue_date"]) == '01/01/1':
                item['permit_lic_eff_date'] = ''
            else:
                item['permit_lic_eff_date'] = self.format_date(x["issue_date"])
            item['permit_notes'] = ''
            if x['owner_name']:
                item['mixed_name'] = self._getDBA(x['owner_name'])[0]
                item['mixed_subtype'] = 'Owner'
            else:
                item['mixed_name'] = ''
                item['mixed_subtype'] = ''
            item['person_address_string'] = ''
            if x['contractor_name'] != "":
                item['mixed_contractor_name'] = self._getDBA(
                    x['contractor_name'])[0]
                item['contractor_lic_no'] = x['contractor_number']
                item['contractor_lic_type'] = 'contractor_license'
            else:
                item['mixed_contractor_name'] = ''
                item['contractor_lic_no'] = ''
                item['contractor_lic_type'] = ''
            item['pin_complete'] = x['pin_complete']
            item['permit_lic_value'] = ''
            item['number_of_stories'] = ''
            item['year_built'] = ''
            item['inspection_id'] = ''
            item['inspection_date'] = ''
            item['inspection_subtype'] = ''
            item['inspection_pass_fail'] = ''
            item['inspector_comments'] = ''
            item['inspection_type'] = ''
            permit_notes_header = {
                'Accept': '*/*',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'en-US,en;q=0.9,ta;q=0.8',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'Content-Type': 'application/json',
                'Host': 'public.claycountygov.com',
                'Referer': 'https://public.claycountygov.com/PermitSearch/',
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
            yield scrapy.Request(url=permit_notes_link,
                                 callback=self.parse_permit_notes,
                                 headers=permit_notes_header,
                                 dont_filter=True,
                                 meta=item)

    # @inline_requests
    def parse_permit_notes(self, response):
        """Collect the permit's notes (tags stripped, '*' rows skipped), then
        request the qPublic parcel page keyed by pin_complete."""
        item = response.meta
        permit_next_page = 'https://qpublic.schneidercorp.com/Application.aspx?AppID=830&LayerID=15008&PageTypeID=4&KeyValue='
        link_pin = item['pin_complete']
        permit_next_page = permit_next_page + link_pin
        permit_notes_json = json.loads(response.body_as_unicode())
        notes = []
        for x in permit_notes_json:
            if '*' not in x['note']:
                p = re.compile(r'<.*?>')
                notes.append(p.sub('', x['note']))
        item['permit_notes'] = ','.join(notes)
        yield scrapy.Request(url=permit_next_page,
                             callback=self.parse_detail_2,
                             dont_filter=True,
                             meta=item)

    def parse_detail_2(self, response):
        """Scrape valuation, addresses and building facts from the parcel
        page, then request the permit's inspection list."""
        item = response.meta
        inspection_link = 'https://public.claycountygov.com/inspectionscheduler/API/Inspection/Permit/' + str(
            item['permit_number'])
        permit_lic_val = response.xpath(
            '//*[@id="ctlBodyPane_ctl11_ctl01_grdValuation"]//tr/td[2]/text()'
        ).extract()
        location_address_string = response.xpath(
            '//*[@id="ctlBodyPane_ctl00_ctl01_lblPropertyAddress"]/text()'
        ).extract()
        location_address_string = ','.join(location_address_string).strip()
        # The trailing digits are the ZIP; rewrite "... 32043" as "...,FL 32043".
        location_address_string = re.sub(r'(\d+)$', r',FL \1',
                                         location_address_string)
        item['location_address_string'] = location_address_string
        personal_address_string = response.xpath(
            '//*[@id="ctlBodyPane_ctl02_ctl01_lstPrimaryOwner_ctl00_lblPrimaryOwnerAddress"]/text()'
        ).extract()
        # BUG FIX: the original joined location_address_string here, silently
        # discarding the owner-address nodes extracted just above.
        personal_address_string = ''.join(
            personal_address_string).strip().replace("\n", '')
        if personal_address_string:
            personal_address_string_re = re.search(r'.*(,FL )[\d]{5}',
                                                   personal_address_string)
            # BUG FIX: the original tested "is None" and then called .group()
            # on the None result (guaranteed AttributeError). Keep the matched
            # prefix when the pattern matches, otherwise the raw string.
            if personal_address_string_re is not None:
                item['person_address_string'] = personal_address_string_re.group()
            else:
                item['person_address_string'] = personal_address_string
        # Sum the valuation column into a single dollar figure.
        value = []
        for x in permit_lic_val:
            if x:
                x = x.replace("$", '').replace(',', '')
                value.append(int(x))
        permit_lic_value = "$" + str(sum(value))
        number_of_stories = response.xpath(
            '//*[@id="ctlBodyPane_ctl04_ctl01_lstBuildings_ctl00_lblStories"]/text()'
        ).get()
        year_built = response.xpath(
            '//*[@id="ctlBodyPane_ctl04_ctl01_lstBuildings_ctl00_Label1"]/text()'
        ).get()
        item['permit_lic_value'] = permit_lic_value
        # BUG FIX: the original used "x == 'None' and x == None", which can
        # never be true, so missing values were stored as the string 'None'.
        if number_of_stories is None or number_of_stories == 'None':
            item['number_of_stories'] = ""
        else:
            item['number_of_stories'] = str(number_of_stories)
        if year_built is None or year_built == 'None':
            item['year_built'] = ''
        else:
            item['year_built'] = str(year_built)
        req_header = {
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,ta;q=0.8',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json; charset=utf-8',
            'Host': 'public.claycountygov.com',
            'Referer': 'https://public.claycountygov.com/inspectionscheduler/',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        yield scrapy.Request(url=inspection_link,
                             callback=self.parse_inspection_data,
                             headers=req_header,
                             dont_filter=True,
                             meta=item)

    def parse_inspection_data(self, response):
        """Emit the CSV rows: one permit row, one contractor row (if any),
        and one row per inspection."""
        item = response.meta
        inspect_json = json.loads(response.body_as_unicode())
        csv_save_data = {
            'permit_lic_no': "",
            'permit_subtype': "",
            'permit_lic_desc': '',
            'location_address_string': '',
            'permit_lic_eff_date': '',
            'notes': '',
            'mixed_name': '',
            'mixed_subtype': '',
            'person_address_string': '',
            'mixed_contractor_name': '',
            'contractor_lic_no': '',
            'contractor_lic_type': '',
            'permit_lic_value': '',
            'number_of_stories': '',
            'year_built': '',
            'inspection_id': '',
            'inspection_date': '',
            'inspection_subtype': '',
            'inspection_pass_fail': '',
            'inspector_comments': '',
            'inspection_type': ''
        }
        csv_save_data['permit_lic_no'] = item['permit_lic_no']
        csv_save_data['permit_subtype'] = item['permit_subtype']
        csv_save_data['permit_lic_desc'] = item['permit_lic_desc']
        if item['location_address_string']:
            csv_save_data['location_address_string'] = item[
                'location_address_string']
        else:
            # Fall back to the bare state so the field is never empty.
            csv_save_data['location_address_string'] = 'FL'
        csv_save_data['permit_lic_eff_date'] = item['permit_lic_eff_date']
        csv_save_data['notes'] = item['permit_notes']
        csv_save_data['mixed_name'] = item['mixed_name']
        csv_save_data['mixed_subtype'] = item['mixed_subtype']
        csv_save_data['person_address_string'] = item['person_address_string']
        csv_save_data['permit_lic_value'] = item['permit_lic_value']
        csv_save_data['number_of_stories'] = item['number_of_stories']
        csv_save_data['year_built'] = item['year_built']
        yield self.save_to_csv(response, **csv_save_data).load_item()
        if item['mixed_contractor_name']:
            # Second row: contractor details, owner fields blanked.
            csv_save_data['mixed_name'] = ''
            csv_save_data['mixed_subtype'] = ''
            csv_save_data['person_address_string'] = ''
            csv_save_data['mixed_contractor_name'] = item[
                'mixed_contractor_name']
            csv_save_data['contractor_lic_no'] = item['contractor_lic_no']
            csv_save_data['contractor_lic_type'] = item['contractor_lic_type']
            yield self.save_to_csv(response, **csv_save_data).load_item()
        if len(inspect_json) > 0:
            for x in inspect_json:
                if x["InsDesc"] != 'No Inspections':
                    csv_save_data['inspection_id'] = x["PermitNo"]
                    csv_save_data['inspection_date'] = x["DisplayInspDateTime"]
                    csv_save_data['inspection_subtype'] = x['InsDesc']
                    csv_save_data['inspection_pass_fail'] = x[
                        "ResultDescription"]
                    csv_save_data['inspector_comments'] = x["Comment"]
                    csv_save_data['inspection_type'] = 'building_inspection'
                    csv_save_data['mixed_contractor_name'] = ''
                    csv_save_data['contractor_lic_no'] = ''
                    csv_save_data['contractor_lic_type'] = ''
                    yield self.save_to_csv(response,
                                           **csv_save_data).load_item()

    def ret_count(self, data):
        """Convert a result-row count into a page count (20 rows/page)."""
        return math.ceil(data / 20)

    def save_to_csv(self, response, **meta_data):
        """Populate an ItemLoader from the collected fields; the caller
        finishes with .load_item()."""
        il = ItemLoader(item=FlClayBuildingPermitsSpiderItem(),
                        response=response)
        il.default_input_processor = MapCompose(lambda v: v.strip(),
                                                remove_tags,
                                                replace_escape_chars)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('permit_lic_no', str(meta_data['permit_lic_no']))
        il.add_value('permit_subtype', meta_data['permit_subtype'])
        il.add_value('permit_lic_desc', meta_data['permit_lic_desc'])
        il.add_value('location_address_string',
                     meta_data['location_address_string'])
        il.add_value('permit_lic_eff_date', meta_data['permit_lic_eff_date'])
        il.add_value('notes', meta_data['notes'])
        il.add_value('mixed_name', meta_data['mixed_name'])
        il.add_value('mixed_subtype', meta_data['mixed_subtype'])
        il.add_value('person_address_string',
                     meta_data['person_address_string'])
        il.add_value('mixed_contractor_name',
                     meta_data['mixed_contractor_name'])
        il.add_value('contractor_lic_no', meta_data['contractor_lic_no'])
        il.add_value('contractor_lic_type', meta_data['contractor_lic_type'])
        il.add_value('permit_lic_value', meta_data['permit_lic_value'])
        # Defensive: blank out the literal string 'None' left by upstream
        # str() conversions.
        if meta_data['number_of_stories'] == 'None':
            il.add_value('number_of_stories', '')
        else:
            il.add_value('number_of_stories', meta_data['number_of_stories'])
        if meta_data['year_built'] == 'None':
            il.add_value('year_built', '')
        else:
            il.add_value('year_built', meta_data['year_built'])
        il.add_value('inspection_id', meta_data['inspection_id'])
        il.add_value('inspection_date', meta_data['inspection_date'])
        il.add_value('inspection_subtype', meta_data['inspection_subtype'])
        il.add_value('inspection_pass_fail',
                     meta_data['inspection_pass_fail'])
        il.add_value('inspector_comments', meta_data['inspector_comments'])
        il.add_value('inspection_type', meta_data['inspection_type'])
        il.add_value('permit_type', "building_permit")
        il.add_value(
            'url',
            "http://www.claycountygov.com/about-us/local-government/public-records-search/permits"
        )
        il.add_value('sourceName', 'FL_Clay_Building_Permits')
        return il
class AlMedicalLicenseViolationsSpider(CommonSpider):
    """Scrapes the AL Board of Medical Examiners public action log.

    parse() posts the paginated grid endpoint (pages 1-12 at 100 rows each);
    parse_main_page() splits the pseudo-JSON payload into per-record strings
    and yields one violation item per record.
    """
    name = '1462_al_medical_license_violations'
    allowed_domains = ['igovsolution.com']
    start_urls = [
        'https://abme.igovsolution.com/online/Lookups/Publiclogfile.aspx'
    ]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1462_Licenses_Medical_Violation_AL_CurationReady'),
        'JIRA_ID': 'AI_1462',
        'DOWNLOAD_DELAY': 0.5,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('AlMedicalLicenseViolationsSpider'),
        'TOP_HEADER': {
            'dba_name': '',
            'location_address_string': 'Address',
            'permit_lic_desc': '',
            'permit_lic_no': 'License #',
            'permit_subtype': 'License Type',
            'permit_type': '',
            'person_name': 'Name',
            'violation_date': 'Latest Action Date',
            'violation_description': 'Download_link',
            'violation_subtype': 'Latest Action Taken',
            'violation_type': ''
        },
        'FIELDS_TO_EXPORT': [
            'person_name', 'dba_name', 'permit_subtype', 'permit_lic_no',
            'location_address_string', 'violation_description',
            'permit_lic_desc', 'violation_type', 'violation_date',
            'violation_subtype', 'permit_type', 'sourceName', 'url',
            'ingestion_timestamp'
        ],
        'NULL_HEADERS': []
    }

    def parse(self, response):
        # Page count (12) is hard-coded to cover the full log at 100
        # records per page — NOTE(review): confirm it still covers the site.
        for page in range(1, 13):
            payload = json.dumps({
                'page': page,
                'pageSize': 100,
                'sdata': [],
                'sortby': "",
                'sortexp': ""
            })
            yield scrapy.Request(
                'https://abme.igovsolution.com/online/JS_grd/Grid.svc/BindPublicFileDetails',
                dont_filter=True,
                method="POST",
                body=payload,
                headers={
                    'Content-Type': 'application/json;charset=UTF-8',
                    'Referer':
                    'https://abme.igovsolution.com/online/Lookups/Publiclogfile.aspx',
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                    'Origin': 'https://abme.igovsolution.com ',
                    'Accept': 'application/json, text/javascript, */*; q=0.01'
                },
                callback=self.parse_main_page)

    def parse_main_page(self, response):
        """Parse one grid page: one violation item per embedded record."""
        # 'd' holds a JSON array rendered as a string; mark the record
        # boundaries ('},{' -> '}~~{') so each record can be parsed alone.
        value1 = json.loads(response.body_as_unicode())
        value2 = value1['d'].replace('},{', '}~~{').split('[')[1].split(']')[0]
        value3 = value2.split('~~')
        for i in value3:
            # Strip stray backslashes and a known embedded-quote pattern so
            # the record string is valid standalone JSON.
            json_acceptable_string = i.replace("\\", "").replace(
                '"administrative medicine"', "'administrative medicine'")
            d = json.loads(json_acceptable_string)
            person_name = d['FullName']
            permit_subtype = d['LicenseType']
            permit_lic_no = d['License_Number']
            if d['Address1'] and d['City'] and d['Zip']:
                location_address_string = d['Address1'] + ', ' + d[
                    'City'] + ' ' + d['Zip']
            else:
                # BUGFIX: location_address_string was previously assigned only
                # inside the if-branch, so a record missing any address part
                # raised NameError (first record) or silently reused the
                # previous record's address. Fall back to the state.
                location_address_string = 'AL'
            violation_description = d['Publicfile']
            permit_lic_desc = 'Medical License for ' + str(person_name)
            violation_type = 'professional_violation'
            vio = d['Action_Date']
            if '-' in vio:
                # A hyphen marks a non-/Date(ms)/ value — presumably "no
                # date"; emit an empty cell (TODO confirm against feed).
                violation_date = ''
            else:
                violation_date = time.strftime(
                    '%m/%d/%Y',
                    time.gmtime(int(re.split(r'\(|\)', vio)[1]) / 1000.))
            violation_subtype = d['ActionTaken']
            il = ItemLoader(item=AlMedicalLicenseViolationsSpiderItem(),
                            response=response)
            il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
            il.add_value('sourceName', 'AL_Medical_License_Violations')
            il.add_value(
                'url',
                'https://abme.igovsolution.com/online/Lookups/Publiclogfile.aspx'
            )
            il.add_value('person_name', self._getDBA(person_name)[0])
            il.add_value('dba_name', self._getDBA(person_name)[1])
            il.add_value('permit_subtype', permit_subtype)
            il.add_value('permit_lic_no', permit_lic_no)
            il.add_value('location_address_string', location_address_string)
            il.add_value('violation_description', violation_description)
            il.add_value('permit_lic_desc', permit_lic_desc)
            il.add_value('violation_type', violation_type)
            il.add_value('violation_date', violation_date)
            il.add_value('violation_subtype', violation_subtype)
            il.add_value('permit_type', 'medical_license')
            yield il.load_item()
class NmSosSpider(CommonSpider):
    """Scrapes the New Mexico Secretary of State business search.

    parse() solves the reCAPTCHA and posts the first name search;
    parse_two() walks the paginated result grid and requests each business
    detail page; page_three() extracts entity fields and emits one row per
    officer, director and registered agent via save_to_csv().
    """
    name = 'ai_1432_nm_sos'
    allowed_domains = ['state.nm.us']
    start_urls = [
        'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch'
    ]
    site_key = '6LcTYiwUAAAAAFZCzelvolLT0OEXctYN31ZpniI-'
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1432_SOS_NM_CurationReady'),
        'JIRA_ID': 'AI_1432',
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 0.5,
        # 'JOBDIR' : CustomSettings.getJobDirectory('nm_sos'),
        'TOP_HEADER': {
            'business purpose': 'Business Purpose',
            'company_name': 'Entity Name',
            'company_subtype': 'Entity Type',
            'creation_date':
            'Date of Incorporation in NM/Date of Organization',
            'dba_name': 'DBA Name',
            'domestic state': 'Domestic State',
            'entity_id': 'Business ID',
            'location_address_string': 'Mailing Address',
            'mixed_name':
            'Registered Agent/Officer/General Partner information name',
            'mixed_subtype': 'Agent/Officer/Partner Information Contact Title',
            'non_profit_indicator': '',
            'period of duration': 'Period of Duration',
            'permit_type': 'permit_type',
            'person_address_string':
            'Agent/Officer /General Partner information Address',
            'status': 'Status'
        },
        'FIELDS_TO_EXPORT': [
            'company_name', 'entity_id', 'dba_name', 'company_subtype',
            'non_profit_indicator', 'location_address_string', 'status',
            'creation_date', 'domestic state', 'period of duration',
            'business purpose', 'mixed_subtype', 'mixed_name',
            'person_address_string', 'permit_type', 'sourceName', 'url',
            'ingestion_timestamp'
        ],
        'NULL_HEADERS':
        ['domestic state', 'business purpose', 'period of duration']
    }

    # __RequestVerificationToken captured from the first (HTML) result page
    # and reused for the XHR paging/detail requests.
    requestt = ''

    def parse(self, response):
        # Build the list of search prefixes (alphabetic or numeric range).
        if self.start.isalpha() and self.end.isalpha():
            self.searchkeys = SearchCriteria.strRange(self.start, self.end)
        else:
            self.searchkeys = SearchCriteria.numberRange(
                self.start, self.end, 1)
        search = self.searchkeys.pop(0)
        self.form_data = {
            'search.SearchName': 'express',
            'search.SearchType': 'BusinessName',
            'search.BusinessName': search,
            'search.ActualBusinessName': search,
            'search.BusinessId': '',
            'search.SearchCriteria': 'StartsWith',
            'search.CitizenShipType': '',
            'search.BusinessTypeId': '0',
            'search.BusinessStatusId': '',
            'search.NaicsCode': '',
            'search.businessStatus': '',
            'search.isGoodStanding': '',
            'search.Country': '',
            'search.Zip': '',
            'search.City': '',
            'search.State': '',
            'search.OtherState': '',
            'search.PostalCode': '',
            'search.AgentType': '',
            'search.RAFirstName': '',
            'search.RAMiddleName': '',
            'search.RALastName': '',
            'search.RASuffix': '',
            'search.RAName': '',
            'search.RAAddress1': '',
            'search.RAAddress2': '',
            'search.RACountry': '',
            'search.RAZip': '',
            'search.RACity': '',
            'search.RAState': '',
            'search.RAOtherState': '',
            'search.RAPostalCode': '',
            'search.DirectorFirstName': '',
            'search.DirectorMiddleName': '',
            'search.DirectorLastName': '',
            'search.DirectorSuffix': '',
            'search.IncorporatorType': '',
            'search.IncorporatorFirstName': '',
            'search.IncorporatorMiddleName': '',
            'search.IncorporatorLastName': '',
            'search.IncorporatorSuffix': '',
            'search.IncorporatorEntityName': '',
            'search.IncorporatorAddress1': '',
            'search.IncorporatorAddress2': '',
            'search.IncorporatorCountry': '',
            'search.IncorporatorZip': '',
            'search.IncorporatorCity': '',
            'search.IncorporatorState': '',
            'search.IncorporatorOtherState': '',
            'search.IncorporatorPostalCode': '',
            'search.OrganizerFirstName': '',
            'search.OrganizerMiddleName': '',
            'search.OrganizerLastName': '',
            'search.OrganizerSuffix': '',
            'search.ReservationNo': '',
            'search.CaptchaResponse':
            self.getcaptchaCoder(self.site_key).resolver(response.url),
        }
        yield scrapy.FormRequest(response.url,
                                 formdata=self.form_data,
                                 dont_filter=True,
                                 method='POST',
                                 callback=self.parse_two,
                                 meta={'page': 2})

    def parse_two(self, response):
        """Walk one grid page; request details, then next page or next key."""
        metaa = response.meta
        if metaa['page'] == 2:
            # Token is only present on the first full-HTML response.
            self.requestt = response.xpath(
                "//input[@name='__RequestVerificationToken']/@value"
            ).extract_first()
        main_tbl = response.xpath('//*[@id="xhtml_Businessesgrid"]//tr')[1:]
        for row in main_tbl:
            name_link = row.xpath('td[1]/a/@onclick').extract_first()
            metaa['id_number'] = re.search(r'\d+', str(name_link)).group()
            # BUGFIX: the original dict literal repeated 'txtCommonPageNo' and
            # 'hdnTotalPgCount' (with '129' then '2'); Python keeps only the
            # last value per key, so the effective payload is written once.
            form_data = {
                'txtCommonPageNo': '',
                'hdnTotalPgCount': '2',
                'businessId': str(metaa['id_number']),
                '__RequestVerificationToken': self.requestt,
            }
            yield scrapy.FormRequest(
                url=
                'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch/CorporationBusinessInformation',
                formdata=form_data,
                callback=self.page_three,
                meta={
                    'page': response.meta['page'],
                    'meta': metaa['id_number']
                })
        page = response.xpath(
            "//a[contains(@href,'xhtmlCorp.paging')][text()='Next >']/@href"
        ).extract()
        if page:
            formdata1 = {
                'undefined': '',
                'sortby': '',
                'stype': 'a',
                'pidx': str(response.meta['page'])
            }
            header = {
                'Accept': '*/*',
                'Host': 'portal.sos.state.nm.us',
                'Origin': 'https://portal.sos.state.nm.us',
                'Referer':
                'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch',
                'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
                'X-Requested-With': 'XMLHttpRequest',
                '__RequestVerificationToken': self.requestt
            }
            yield scrapy.FormRequest(
                url=
                'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch/BusinessList',
                headers=header,
                method='POST',
                dont_filter=True,
                formdata=formdata1,
                callback=self.parse_two,
                meta={
                    'page': response.meta['page'] + 1,
                    'metaa': metaa
                })
        elif self.searchkeys:
            # BUGFIX: pop(0) was unguarded and raised IndexError once every
            # search key had been consumed; now the spider simply finishes.
            search = self.searchkeys.pop(0)
            self.form_data['search.BusinessName'] = search
            self.form_data['search.ActualBusinessName'] = search
            yield scrapy.FormRequest(response.url,
                                     formdata=self.form_data,
                                     dont_filter=True,
                                     method='POST',
                                     callback=self.parse_two,
                                     meta={'page': 2})

    def page_three(self, response):
        """Extract one business detail page into items (one per contact)."""
        meta = response.meta
        meta['company_name'] = response.xpath(
            "//td[contains(text(),'Entity Name:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        meta['business_id'] = response.xpath(
            "//td[contains(text(),'Business ID#:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        meta['dba_name'] = response.xpath(
            "//td[contains(text(),'DBA Name:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        if meta['dba_name'] == 'Not Applicable':
            meta['dba_name'] = ''
        meta['company_subtype'] = response.xpath(
            "//td[contains(text(),'Entity Type:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        # Guard against a missing entity type (extract_first() -> None),
        # which previously raised TypeError on the `in` tests.
        subtype = meta['company_subtype'] or ''
        if ('non profit' in subtype or 'not profit' in subtype
                or 'Nonprofit' in subtype or 'Notprofit' in subtype):
            meta['non_profit_indicator'] = 'Yes'
        else:
            meta['non_profit_indicator'] = ''
        meta['location_address_string'] = response.xpath(
            "//span[contains(text(),'Mailing Address:')]/ancestor::td/following-sibling::td/strong/text()"
        ).extract_first()
        meta['location_address_string'] = meta[
            'location_address_string'] if meta[
                'location_address_string'] else 'NM'
        meta['status'] = response.xpath(
            "//td[contains(text(),'Status:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        # NOTE(review): the output header maps creation_date to the date of
        # incorporation/organization, but this XPath reads 'Date of
        # Appointment:' — confirm against the live page.
        meta['creation_date'] = response.xpath(
            "//td[contains(text(),'Date of Appointment:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        meta['domestic_state'] = response.xpath(
            "//td[contains(text(),'State of Incorporation:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        meta['period_of_duration'] = response.xpath(
            "//td[contains(text(),'Period of Duration:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        meta['business_purpose'] = response.xpath(
            "//td[contains(text(),'Business Purpose:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        # One item per officer row (header row skipped).
        office_info = response.xpath(
            "//table[@id='grid_OfficersList']//tr")[1:]
        for tr in office_info:
            meta['officer_title'] = tr.xpath('.//td[1]/text()').extract_first()
            meta['officer_name'] = tr.xpath('.//td[2]/text()').extract_first()
            meta['officer_address'] = tr.xpath(
                './/td[3]/text()').extract_first()
            if meta['officer_address'] == 'NONE' or meta[
                    'officer_address'] == '':
                meta['officer_address'] = 'NM'
            # `or ''` guards a None title (placeholder row check).
            if 'No Records to View' not in (meta['officer_title'] or ''):
                yield self.save_to_csv(response, **meta).load_item()
        # One item per director row.
        director_info = response.xpath('//*[@id="grid_DirectorList"]//tr')[1:]
        for dr in director_info:
            meta['officer_title'] = dr.xpath('.//td[1]/text()').extract_first()
            meta['officer_name'] = dr.xpath('.//td[2]/text()').extract_first()
            # BUGFIX: this previously read `tr.xpath(...)`, reusing the last
            # *officer* row's address for every director (and raising
            # NameError when there were no officer rows at all).
            meta['officer_address'] = dr.xpath(
                './/td[3]/text()').extract_first()
            if meta['officer_address'] == 'NONE' or meta[
                    'officer_address'] == '':
                meta['officer_address'] = 'NM'
            yield self.save_to_csv(response, **meta).load_item()
        # Finally one item for the registered agent.
        meta['officer_name'] = response.xpath(
            "//td[starts-with(text(),'Name:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        meta['officer_address'] = response.xpath(
            "//td[starts-with(text(),'Physical Address:')]/following-sibling::td[1]/strong/text()"
        ).extract_first()
        meta['officer_title'] = 'Agent'
        yield self.save_to_csv(response, **meta).load_item()

    def save_to_csv(self, response, **meta):
        """Build the item loader for one output row; caller loads the item."""
        il = ItemLoader(item=NmSosSpiderItem(), response=response)
        il.default_input_processor = MapCompose(lambda v: v.strip(),
                                                remove_tags,
                                                replace_escape_chars)
        il.add_value('company_name', meta['company_name'])
        il.add_value('entity_id', meta['business_id'])
        il.add_value('dba_name', meta['dba_name'])
        il.add_value('company_subtype', meta['company_subtype'])
        il.add_value('non_profit_indicator', meta['non_profit_indicator'])
        il.add_value('location_address_string',
                     meta['location_address_string'])
        il.add_value('status', meta['status'])
        il.add_value('creation_date', meta['creation_date'])
        il.add_value('domestic state', meta['domestic_state'])
        il.add_value('period of duration', meta['period_of_duration'])
        il.add_value('business purpose', meta['business_purpose'])
        il.add_value('mixed_subtype', meta['officer_title'])
        il.add_value('mixed_name', meta['officer_name'])
        il.add_value('person_address_string', meta['officer_address'])
        il.add_value('permit_type', 'business_license')
        il.add_value('sourceName', 'NM_SOS')
        il.add_value(
            'url',
            'https://portal.sos.state.nm.us/BFS/online/CorporationBusinessSearch'
        )
        return il
class AlMedicalPersonLicensesSpider(CommonSpider):
    """AL Board of Medical Examiners individual license lookup.

    parse() verifies the captcha; verify_captcha() posts the grid search for
    the next last-name prefix; getIndv_license() (inline_requests) fetches
    each license's "print" detail page, emits one item per license, then
    either pages the grid or moves on to the next prefix.
    """
    name = '1461_al_medical_person_licenses'
    allowed_domains = ['igovsolution.com']
    start_urls = [
        'https://abme.igovsolution.com/online/Lookups/Individual_Lookup.aspx'
    ]
    site_key = '6LchcFEUAAAAAJdfnpZDr9hVzyt81NYOspe29k-x'
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1461_Licenses_Medical_Person_AL_CurationReady_'),
        'JIRA_ID': 'AI_1461',
        'HTTPCACHE_ENABLED': False,
        'CONCURRENT_REQUESTS': 1,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        # 'JOBDIR' : CustomSettings.getJobDirectory('al_medical_person_licenses'),
        'TOP_HEADER': {
            'coq status': 'COQ status',
            'location_address_string': 'Location',
            'pa/crnp/cnm name': 'PA/CRNP/CNM Name',
            'permit_lic_desc': 'License description',
            'permit_lic_eff_date': 'Issue date',
            'permit_lic_exp_date': 'Expiration date',
            'permit_lic_no': 'License number',
            'permit_lic_status': 'License status',
            'permit_subtype': 'License type',
            'permit_type': '',
            'company_name': "Licensee name/physician's name",
            "physician's license": "Physician's License",
            'practice type': 'Practice Type',
            'ra/cp number': 'RA/CP Number',
            'school name': 'School Name'
        },
        'FIELDS_TO_EXPORT': [
            'company_name', "physician's license", 'pa/crnp/cnm name',
            'ra/cp number', 'location_address_string', 'permit_subtype',
            'permit_lic_status', 'coq status', 'permit_lic_no',
            'permit_lic_desc', 'permit_lic_eff_date', 'permit_lic_exp_date',
            'practice type', 'school name', 'permit_type', 'sourceName',
            'url', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': [
            'ra/cp number', 'coq status', 'practice type', 'pa/crnp/cnm name',
            'school name', "physician's license"
        ]
    }
    search_element = []

    # License_Type -> "print" page that renders that license's detail view.
    # Types without an entry (e.g. 'RSV') have no print page and are skipped.
    _PRINT_PAGES = {
        'MD': 'Print_MD_DO_Laspx.aspx',
        'DO': 'Print_MD_DO_Laspx.aspx',
        'L': 'Print_MD_DO_Laspx.aspx',
        'SP': 'Print_MD_DO_Laspx.aspx',
        'PA': 'Print_PA_TA.aspx',
        'TA': 'Print_PA_TA.aspx',
        'AA': 'Print_AA.aspx',
        'RA': 'Print_RA.aspx',
        'CP': 'Print_CP.aspx',
        'ACSC': 'Print_other.aspx',
        # BUGFIX: the original branch compared against 'QASC', which never
        # matches the 'QACSC' type listed in the license-type table.
        'QACSC': 'print_Quality_CSC.aspx',
        'QACSCNP': 'print_Quality_CSC.aspx',
        'LPSP': 'Print_LPSPaspx.aspx',
    }

    def __init__(self, start=None, end=None, startnum=None, endnum=None,
                 proxyserver=None, *a, **kw):
        # BUGFIX: the proxyserver argument was previously hard-coded to None
        # in the super() call, silently discarding the caller's value.
        super(AlMedicalPersonLicensesSpider, self).__init__(
            start, end, *a, proxyserver=proxyserver, **kw)
        self.search_element = SearchCriteria.strRange(start, end)

    def parse(self, response):
        # Solve the reCAPTCHA and exchange it for a verification id ('vid').
        yield scrapy.Request(
            response.urljoin('/online/JS_grd/Grid.svc/Verifycaptcha'),
            method='POST',
            body=json.dumps({
                "resp":
                self.getcaptchaCoder(self.site_key).resolver(response.url),
                "uip": ""
            }),
            headers={'Content-Type': 'application/json'},
            callback=self.verify_captcha,
            dont_filter=True)

    def dict(self, lic_typee):
        # Map a License_Type code to its display name ('' when unknown).
        # NOTE: the method name shadows the builtin `dict`; kept for
        # backward compatibility with existing callers.
        dict1 = {
            'MD': 'Medical Doctor',
            'DO': 'Doctor of Osteopathy',
            'L': 'Limited MD or DO',
            'PA': 'Physician Assistant',
            'AA': 'Anesthesiologist Assistant',
            'TA': 'temporary Physician Assistant',
            'SP': 'Special Purpose(Tele Medicine)',
            'RA': 'RA',
            'CP': 'CP',
            'ACSC': 'Alabama Controlled Substance Certificate',
            'QACSC': 'Qualified Alabama Controlled Substance Certificate',
            'QACSCNP': 'QACSCNP',
            'LPSP': 'LPSP',
            'RSV': 'RSV'
        }
        return dict1.get(lic_typee, "")

    def verify_captcha(self, response):
        """Start the grid walk for the next last-name prefix."""
        if len(self.search_element) > 0:
            param = self.search_element.pop(0)
            jsonresponse = json.loads(response.body_as_unicode())
            formdata = {
                'county': '-1',
                'fname': '',
                'lictype': '-1',
                'lname': str(param),
                'lnumber': '',
                'page': '1',
                'pageSize': '20',
                'sdata': [],
                'sortby': '',
                'sortexp': '',
                'vid': jsonresponse['d'],
            }
            yield scrapy.Request(
                response.urljoin('/online/JS_grd/Grid.svc/GetIndv_license'),
                method='POST',
                body=json.dumps(formdata),
                headers={'Content-Type': 'application/json'},
                callback=self.getIndv_license,
                meta={
                    'page': '2',
                    'vid': jsonresponse['d'],
                    'param': param
                })

    @inline_requests
    def getIndv_license(self, response):
        """Emit one item per license on this grid page, then paginate."""
        aa = json.loads(json.loads(response.body_as_unicode())['d'])
        val = aa['Response']
        location_address_string_temp = 'Alabama'
        if val:
            for item_ in json.loads(val):
                lic_id = item_['App_ID']
                lic_type = item_['License_Type']
                temp_name = item_['Name']
                page_name = self._PRINT_PAGES.get(lic_type)
                if not page_name:
                    # Robustness: types without a print page previously fell
                    # through with detail_page == '' and crashed on
                    # ''.xpath(...); skip them instead.
                    continue
                detail_page = yield scrapy.Request(
                    url='https://abme.igovsolution.com/online/ABME_Prints/' +
                    page_name + '?appid=' + str(lic_id),
                    method='GET')
                person_name1 = detail_page.xpath(
                    "//span[contains(text(),'Licensee name')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                person_name = person_name1 if person_name1 else temp_name
                permit_subtype = self.dict(lic_type)
                # BUGFIX: the original built the label with
                # ("physician's license:" or "Physician's License:"), which
                # always evaluates to the first literal; match both casings.
                physician_lic = detail_page.xpath(
                    '//span[contains(text(), "physician\'s license:") or '
                    'contains(text(), "Physician\'s License:")]'
                    '/ancestor::div/following-sibling::div/span/text()'
                ).extract_first()
                pa_cp_ra_name = detail_page.xpath(
                    "//span[contains(text(),'PA/CRNP/CNM Name:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                ra_cp_number = detail_page.xpath(
                    "//span[contains(text(),'RA/CP Number:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                location_address_string1 = str(
                    detail_page.xpath(
                        "//span[contains(text(),'Location:')]/ancestor::div/following-sibling::div/span/text()"
                    ).extract_first()).strip()
                location_address_string = (location_address_string1
                                           if location_address_string1 else
                                           location_address_string_temp)
                permit_lic_status = detail_page.xpath(
                    "//span[contains(text(),'License status:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                coq_status = detail_page.xpath(
                    "//span[contains(text(),'COQ status:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                permit_lic_number = detail_page.xpath(
                    "//span[contains(text(),'License number:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                # Some print pages label the field 'Description:' instead.
                permit_lic_desc = detail_page.xpath(
                    "//span[contains(text(),'License description:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                if not permit_lic_desc:
                    permit_lic_desc = detail_page.xpath(
                        "//span[contains(text(),'Description:')]/ancestor::div/following-sibling::div/span/text()"
                    ).extract_first()
                permit_lic_eff_date = detail_page.xpath(
                    "//span[contains(text(),'Issue date:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                permit_lic_exp_date = detail_page.xpath(
                    "//span[contains(text(),'Expiration date:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                practice_type = detail_page.xpath(
                    "//span[contains(text(),'Practice Type')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                school_name = detail_page.xpath(
                    "//span[contains(text(),'School Name:')]/ancestor::div/following-sibling::div/span/text()"
                ).extract_first()
                il = ItemLoader(item=AlMedicalPersonLicensesSpiderItem(),
                                response=response)
                il.default_input_processor = MapCompose(
                    lambda v: v.strip(), remove_tags, replace_escape_chars)
                il.add_value('ingestion_timestamp',
                             Utils.getingestion_timestamp())
                il.add_value(
                    'url',
                    'https://abme.igovsolution.com/online/Lookups/Individual_Lookup.aspx'
                )
                il.add_value('sourceName', 'AL_Medical_Person_Licenses')
                il.add_value('company_name', person_name)
                il.add_value("physician's license", physician_lic)
                il.add_value('pa/crnp/cnm name', pa_cp_ra_name)
                il.add_value('ra/cp number', ra_cp_number)
                il.add_value(
                    'location_address_string', location_address_string
                    if location_address_string
                    and len(location_address_string) > 4 else 'AL')
                il.add_value('permit_lic_status', permit_lic_status)
                il.add_value('coq status', coq_status)
                il.add_value('permit_lic_no', permit_lic_number)
                il.add_value(
                    'permit_lic_desc',
                    permit_lic_desc if permit_lic_desc else 'medical_license')
                il.add_value('permit_lic_eff_date', permit_lic_eff_date)
                il.add_value('permit_lic_exp_date', permit_lic_exp_date)
                il.add_value('practice type', practice_type)
                il.add_value('school name', school_name)
                il.add_value('permit_subtype', permit_subtype)
                il.add_value('permit_type', 'medical_license')
                yield il.load_item()
        # Pagination: 20 records per page; 'reccount' is the total hits.
        total = aa['reccount']
        page = response.meta['page']
        vid = response.meta['vid']
        param = response.meta['param']
        if total:
            page_no = (int(total) / 20) + 2
            if int(page_no) > int(page):
                headers = {
                    'Accept': 'application/json, text/javascript, */*; q=0.01',
                    'Content-Type': 'application/json; charset=UTF-8',
                    'Origin': 'https://abme.igovsolution.com',
                    'Referer':
                    'https://abme.igovsolution.com/online/Lookups/Individual_Lookup.aspx',
                    'User-Agent':
                    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                    'X-Requested-With': 'XMLHttpRequest',
                }
                # Simplified from json.loads(json.dumps('{...}')), which is
                # an identity round-trip on the hand-built string.
                body = json.dumps({
                    'lnumber': '',
                    'lname': str(param),
                    'fname': '',
                    'lictype': '-1',
                    'county': '-1',
                    'vid': str(vid),
                    'pageSize': 20,
                    'page': int(page),
                    'sortby': '',
                    'sortexp': '',
                    'sdata': []
                })
                yield scrapy.FormRequest(
                    url=
                    'https://abme.igovsolution.com/online/JS_grd/Grid.svc/GetIndv_license',
                    method='POST',
                    headers=headers,
                    body=body,
                    callback=self.getIndv_license,
                    meta={
                        'page': int(page) + 1,
                        'vid': vid,
                        'param': param
                    })
        elif len(self.search_element) > 0:
            # NOTE(review): time.sleep blocks the whole reactor for 2
            # minutes between prefixes — presumably deliberate throttling
            # before re-solving the captcha; confirm before changing.
            time.sleep(120)
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse,
                                 dont_filter=True)
class KyLiquorLicensesSpider(CommonSpider):
    """Loads KY liquor license records from a local tab-separated dump.

    parse() ignores the placeholder start URL and reads file1.csv from the
    spider's own directory (additional dumps file1-file6 live under
    AI-1286/), yielding one item per data row.
    """
    name = '1286_ky_liquor_licenses'
    # allowed_domains = ['ky.gov']
    start_urls = ['https://www.aitrg.com']
    pro_urls = []
    custom_settings = {
        'FILE_NAME':
        Utils.getRundateFileName('AI-1286_Licenses_Liquor_KY_CurationReady'),
        'JIRA_ID': 'AI_1286',
        'DOWNLOAD_DELAY': .2,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'AJAXCRAWL_ENABLED':False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('ky_liquor_licenses'),
        'TOP_HEADER': {
            'company_name': 'Licensee Name',
            'company_phone': 'Premises Phone',
            'company_subtype': 'Business Type',
            'dba_name': 'DBA',
            'effective date': 'Effective Date',
            'is licensee owner': 'Is Licensee Owner',
            'licensing county': 'Licensing County',
            'location_address_string': 'Site Address + City + State + Zip',
            'mail_address_string': 'Mailing Address + City + State + Zip',
            'permit_lic_desc': '',
            'permit_lic_eff_date': 'Issue Date',
            'permit_lic_exp_date': 'Expire Date',
            'permit_lic_no': 'License Number',
            'permit_lic_status': 'Status',
            'permit_subtype': 'License Type',
            'permit_type': '',
            'person_name': 'PARTNERS',
            'real estate owner': 'Real Estate Owner',
            'restrictions': 'Restrictions',
            'site id': 'Site Id'
        },
        'FIELDS_TO_EXPORT': [
            'site id', 'location_address_string', 'licensing county',
            'company_phone', 'company_name', 'dba_name', 'company_subtype',
            'mail_address_string', 'person_name', 'real estate owner',
            'is licensee owner', 'permit_subtype', 'permit_lic_desc',
            'permit_lic_no', 'permit_lic_status', 'permit_lic_eff_date',
            'effective date', 'permit_lic_exp_date', 'restrictions',
            'permit_type', 'url', 'sourceName', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': [
            'site id', 'licensing county', 'real estate owner',
            'is licensee owner', 'effective date', 'restrictions'
        ]
    }

    # Output field for each of the 19 tab-separated columns, in file order.
    _COLUMNS = [
        'site id', 'location_address_string', 'licensing county',
        'company_phone', 'company_name', 'dba_name', 'company_subtype',
        'mail_address_string', 'person_name', 'real estate owner',
        'is licensee owner', 'permit_subtype', 'permit_lic_desc',
        'permit_lic_no', 'permit_lic_status', 'permit_lic_eff_date',
        'effective date', 'permit_lic_exp_date', 'restrictions'
    ]

    def parse(self, response):
        import csv
        module_dir = os.path.dirname(os.path.realpath(__file__))
        with open(module_dir + '/file1.csv', errors='ignore') as csvfile:
            read_csv = csv.reader(csvfile, delimiter='\t')
            for count, row in enumerate(read_csv, start=1):
                if count <= 2:
                    continue  # two header rows
                if len(row) < 5:
                    break  # footer / trailing junk terminates the data
                if len(row) < len(self._COLUMNS):
                    # BUGFIX: fields up to row[18] are read below, but only
                    # len(row) < 5 was checked — rows with 5..18 columns
                    # previously raised IndexError. Skip them instead.
                    continue
                il = ItemLoader(item=KyLiquorLicensesSpiderItem())
                il.default_input_processor = MapCompose(
                    lambda v: v.strip(), remove_tags,
                    lambda data: re.sub(r'\s+', ' ', data) if data else '',
                    replace_escape_chars)
                il.add_value('ingestion_timestamp',
                             Utils.getingestion_timestamp())
                il.add_value('sourceName', 'KY_Liquor_Licenses')
                il.add_value(
                    'url',
                    'https://dppweb.ky.gov/ABCSTAR27161/page/License_Lookup/portal.aspx'
                )
                il.add_value('permit_type', 'liquor_license')
                for idx, field in enumerate(self._COLUMNS):
                    il.add_value(field, row[idx])
                yield il.load_item()
class AlCosmetologyLicensesSpider(CommonSpider):
    """Scrapes cosmetology license records from the Alabama Board of
    Cosmetology (GLSuite) public verification search.

    Crawl strategy: the search form requires both a last-name prefix and a
    shop-name prefix.  ``search_element`` (last-name keys) and
    ``search_element_a`` (shop-name keys) are consumed destructively via
    ``pop(0)``; for every last-name key the full list of shop-name keys is
    regenerated and exhausted before moving on.  Result pages are paginated
    ASP.NET postbacks handled in :meth:`parse_details`.
    """
    name = '1478_al_cosmetology_licenses'
    allowed_domains = ['glsuite.us']
    start_urls = ['https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/VerificationSearch.aspx']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1478_Licenses_Cosmetology_AL_CurationReady'),
        'JIRA_ID': 'AI_1478',
        'HTTPCACHE_ENABLED': False,
        'CONCURRENT_REQUESTS': 1,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'DOWNLOAD_DELAY': 4,
        # 'JOBDIR' : CustomSettings.getJobDirectory('al_cosmetology_licenses'),
        # Maps exported field name -> human-readable CSV column header.
        'TOP_HEADER': {'company_name': 'Salon Name', 'disciplinary action': 'Disciplinary Action', 'location_address_string': 'Address', 'permit_lic_desc': '', 'permit_lic_exp_date': 'License Expiration Date', 'permit_lic_no': 'License Number', 'permit_lic_status': 'License Status', 'permit_subtype': 'License Type', 'permit_type': '', 'person_name': 'Name', 'violation_type': ''},
        'FIELDS_TO_EXPORT': ['person_name', 'company_name', 'location_address_string', 'permit_lic_no', 'permit_subtype', 'permit_lic_exp_date', 'permit_lic_status', 'disciplinary action', 'violation_type', 'permit_lic_desc', 'permit_type', 'url', 'sourceName', 'ingestion_timestamp'],
        'NULL_HEADERS': ['disciplinary action']}

    def __init__(self, start=None, end=None, startnum=None, endnum=None, proxyserver=None, *a, **kw):
        """Build the last-name search-key range from the start/end CLI args.

        NOTE(review): ``proxyserver`` is accepted but the super() call passes
        ``proxyserver=None``, silently discarding the argument — confirm
        whether that is intentional.  ``startnum``/``endnum`` are unused here.
        """
        super(AlCosmetologyLicensesSpider, self).__init__(start, end, proxyserver=None, *a, **kw)
        self.search_element = SearchCriteria.strRange(self.start, self.end)

    # Class-level defaults; the instance attributes shadow these once the
    # crawl starts mutating them.
    search_element = []        # pending last-name search keys
    search_element_a = []      # pending shop-name search keys for the current last-name key
    check_first = True         # True -> next parse() must advance to a new last-name key

    def parse(self, response):
        """Submit the verification search form for the next key pair.

        On a fresh last-name key (``check_first``), regenerate the shop-name
        key range and pop the next last-name key.  Keys are sent with a
        leading '*' wildcard so they act as substring matches.
        """
        if self.check_first:
            self.check_first = False
            # presumably self.starta / self.enda come from CommonSpider CLI
            # args — TODO confirm against the base class.
            self.search_element_a = SearchCriteria.strRange(self.starta, self.enda)
            self.search_element1 = '*' + str(self.search_element.pop(0))
        if len(self.search_element_a) > 0:
            val = '*' + str(self.search_element_a.pop(0))
            form_data = {'ctl00$ContentPlaceHolder1$txtLastName': str(self.search_element1),
                         'ctl00$ContentPlaceHolder1$txtShopName': str(val),
                         '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btnLicenseeSubmit'}
            # Browser-mimicking headers; the Accept value is kept verbatim
            # from the original request capture.
            head = {'Connection': 'keep-alive',
                    'Host': 'alboc.glsuite.us',
                    'Origin': 'https://alboc.glsuite.us',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application signed-exchange;v=b3',
                    'Referer': 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/VerificationSearch.aspx',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8'}
            yield scrapy.FormRequest.from_response(response, headers=head, formdata=form_data, formid='aspnetForm', method="POST", dont_filter=True, callback=self.parse_details)

    @inline_requests
    def parse_details(self, response):
        """Walk one results page: yield an item per row (fetching each row's
        detail page synchronously via inline_requests), then either follow
        the next pagination postback or restart :meth:`parse` with the next
        search key.
        """
        # Skip the header row of the results grid.
        tr_list = response.xpath('//*[@id="ctl00_ContentPlaceHolder1_dtgResults"]//tr')[1:]
        for tr in tr_list:
            link = tr.xpath('td[10]/a/@href').extract_first()
            company_name = tr.xpath('td[4]/text()').extract_first()
            f_name = tr.xpath('td[1]/text()').extract_first()
            m_name = tr.xpath('td[2]/text()').extract_first()
            l_name = tr.xpath('td[3]/text()').extract_first()
            person_name = self.format_name(f_name, m_name, l_name)
            # Fall back to the person's name when the salon name is blank/too
            # short.  NOTE(review): the true branch is a no-op as written.
            if company_name and len(company_name) > 2:
                company_name = company_name
            else:
                company_name = person_name
            if link:
                link_url = 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/' + str(link)
                # inline_requests: this yield blocks until the detail page
                # response comes back.
                parse_res = yield scrapy.Request(url=link_url, dont_filter=True)
                add = parse_res.xpath('//*[contains(text(),"City")]/following-sibling::td/span/text()').extract_first()
                state = parse_res.xpath('//*[contains(text(),"State")]/following-sibling::td/span/text()').extract_first()
                if add and state:
                    location_address_string = add + ', ' + state
                else:
                    # NOTE(review): if both are missing this leaves None; the
                    # add_value below then falls back to 'AL'.
                    location_address_string = state
                permit_lic_no = parse_res.xpath('//*[contains(text(),"License Number")]/following-sibling::td/span/text()').extract_first()
                permit_subtype = parse_res.xpath('//*[contains(text(),"License Type")]/following-sibling::td/span/text()').extract_first()
                permit_lic_exp_date = parse_res.xpath('//*[contains(text(),"License Expiration Date")]/following-sibling::td/span/text()').extract_first()
                permit_lic_status = parse_res.xpath('//*[contains(text(),"License Status")]/following-sibling::td/span/text()').extract_first()
                disciplinary_action = parse_res.xpath('//*[contains(text(),"Disciplinary Action")]/following-sibling::td/span/text()').extract_first()
                il = ItemLoader(item=AlCosmetologyLicensesSpiderItem(), response=response)
                # il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
                il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
                il.add_value('url', 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/VerificationSearch.aspx')
                il.add_value('sourceName', 'AL_Cosmetology_Licenses')
                il.add_value('permit_lic_exp_date', permit_lic_exp_date)
                il.add_value('permit_lic_status', permit_lic_status)
                il.add_value('person_name', person_name)
                il.add_value('violation_type', '')
                il.add_value('disciplinary action', disciplinary_action)
                il.add_value('permit_lic_desc', ('Cosmetology License for' + ' ' + str(company_name)) if company_name and len(company_name) > 2 else 'Cosmetology License')
                il.add_value('permit_type', 'cosmetology_license')
                il.add_value('location_address_string', location_address_string if location_address_string and len(location_address_string) > 2 else 'AL')
                il.add_value('permit_lic_no', permit_lic_no)
                il.add_value('company_name', company_name)
                il.add_value('permit_subtype', permit_subtype)
                yield il.load_item()
        # Pagination: a "next page" link next to the current-page <span>.
        pageee = response.xpath('//td[@colspan="10"]/span/following-sibling::a/@href').extract_first()
        if pageee:
            # Convert the javascript __doPostBack(...) href into form fields.
            page_link = JavaScriptUtils.getValuesFromdoPost(pageee)
            page_data = {'__EVENTTARGET': page_link['__EVENTTARGET'],
                         '__EVENTARGUMENT': page_link['__EVENTARGUMENT'],
                         '__VIEWSTATE': response.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first(),
                         '__VIEWSTATEGENERATOR': response.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
                         '__EVENTVALIDATION': response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                         '__VIEWSTATEENCRYPTED': response.xpath('//*[@id="__VIEWSTATEENCRYPTED"]/@value').extract_first()}
            yield scrapy.FormRequest(url=response.url, method='POST', formdata=page_data, callback=self.parse_details, dont_filter=True)
        elif len(self.search_element_a) > 0:
            # More shop-name keys for the current last-name key.
            yield scrapy.Request(url=self.start_urls[0], callback=self.parse, dont_filter=True)
        elif len(self.search_element) > 0:
            # Current last-name key exhausted: advance to the next one.
            self.check_first = True
            yield scrapy.Request(url=self.start_urls[0], callback=self.parse, dont_filter=True)

    def format_name(self, f_name, m_name, l_name):
        """Join first/middle/last name parts, skipping None/empty parts and
        stripping stray whitespace."""
        return " ".join([y.strip() for y in [f_name, " ".join([i for i in [m_name, l_name] if i])] if y])
class AlFoodInspectionsSpider(CommonSpider):
    """Scrapes restaurant/food-establishment inspection scores from the
    Alabama Department of Public Health food-scores search site.

    Crawl strategy: county-by-county.  :meth:`parse` collects the county
    dropdown options, then :meth:`parse_next1` both emits items for the
    current county and submits the next county's search.  ``self.county``
    (display names) and ``self.county_list`` (option values) are popped in
    lockstep.  ``self.response`` is cached so the errback can rebuild the
    ASP.NET form state after a failed request.
    """
    name = '1464_al_food_inspections'
    allowed_domains = ['alabamapublichealth.gov']
    start_urls = ['http://foodscores.state.al.us']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1464_Inspections_Food_AL_CurationReady'),
        'JIRA_ID': 'AI_1464',
        'COOKIES_ENABLED': True,
        'DOWNLOAD_DELAY': 5,
        # 'CONCURRENT_REQUESTS': 1,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('al_food_inspections'),
        'TOP_HEADER': {'company_name': 'Establishment Name', 'county': 'County', 'dba_name': '', 'inspection_score': 'Score', 'inspection_date': 'Inspection date', 'inspection_type': '', 'location_address_string': 'Address', 'smoke free': 'Smoke Free'},
        'FIELDS_TO_EXPORT': ['county', 'company_name', 'dba_name', 'location_address_string', 'smoke free', 'inspection_type', 'inspection_date', 'inspection_score', 'sourceName', 'url', 'ingestion_timestamp', ],
        'NULL_HEADERS': ['county', 'smoke free']
    }

    def parse(self, response):
        """Read the county dropdown and submit the search for the first
        county (skipping the placeholder first <option>)."""
        self.response = response
        # res_data=response
        self.county_list = response.xpath("//select[@id='ctl00_ContentPlaceHolder1_DrpCnty']/option/@value").extract()[1:]
        self.county = response.xpath("//select[@id='ctl00_ContentPlaceHolder1_DrpCnty']/option/text()").extract()[1:]
        # self.county_list=['36','45']
        print(self.county_list)
        if self.county_list:
            self.county_data = self.county.pop(0)
            self.county_pop = self.county_list.pop(0)
            # ASP.NET UpdatePanel postback payload; viewstate fields are
            # echoed back from the page we just loaded.
            formdata = {
                "ctl00$ScriptManager1": "ctl00$UpdatePanel1|ctl00$ContentPlaceHolder1$BtnSearch",
                "ctl00$ContentPlaceHolder1$TxtEstdNm": "",
                "ctl00$ContentPlaceHolder1$DrpEstdType": "All",
                "ctl00$ContentPlaceHolder1$txtCity": "",
                "ctl00$ContentPlaceHolder1$DrpCnty": str(self.county_pop),
                "__LASTFOCUS": "",
                "__VIEWSTATE": response.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first(),
                "__VIEWSTATEGENERATOR": response.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
                "__EVENTTARGET": "",
                "__EVENTARGUMENT": "",
                "__EVENTVALIDATION": response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                "ctl00$ContentPlaceHolder1$BtnSearch.x": "60",
            }
            header = {
                "Accept": '*/*',
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "Host": "foodscores.state.al.us",
                "Origin": "http://foodscores.state.al.us",
                "Referer": response.url,
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
            }
            print(header)
            yield scrapy.FormRequest.from_response(response, callback=self.parse_next1, dont_filter=True, formdata=formdata, headers=header, dont_click=True, errback=self.errors, meta={"max_retry_times": 3})

    def parse_next1(self, response):
        """Emit one item per establishment row in the results table, then
        submit the search for the next county (if any)."""
        # inspect_response(response,self)
        # The UpdatePanel reply is a partial-postback payload; re-wrap it so
        # xpath works on it.
        res = HtmlResponse(response.url, body=str.encode(response.text))
        dic = {}
        self.response = res
        table = res.xpath("//table[@id='ctl00_ContentPlaceHolder1_DtList']//tr")
        for i, row in enumerate(table):
            # Row widgets are named ..._ctlNN_... with a zero-padded index.
            v = (str(i).zfill(2))
            company_name = res.xpath("//span[@id='ctl00_ContentPlaceHolder1_DtList_ctl" + str(v) + "_LblEst']/text()").extract_first()
            # dba_format returns (company, dba) split out of the raw name.
            dic['company_name'] = self.dba_format(company_name)[0]
            dic['dba_name'] = self.dba_format(company_name)[1]
            city = res.xpath("//span[@id='ctl00_ContentPlaceHolder1_DtList_ctl" + str(v) + "_Label1']/text()").extract_first()
            address = res.xpath("//a[@id='ctl00_ContentPlaceHolder1_DtList_ctl" + str(v) + "_LnkAdd']/text()").extract_first()
            zipcode = res.xpath("//span[@id='ctl00_ContentPlaceHolder1_DtList_ctl" + str(v) + "_Label2']/text()").extract_first()
            dic['location_address_string'] = ''
            if address:
                dic['location_address_string'] += address + ', '
            if city:
                dic['location_address_string'] += city + ' '
            if zipcode:
                dic['location_address_string'] += zipcode
            dic['smoke free'] = res.xpath("//span[@id='ctl00_ContentPlaceHolder1_DtList_ctl" + str(v) + "_LblSmoke']/text()").extract_first()
            dic['inspection_score'] = res.xpath("//span[@id='ctl00_ContentPlaceHolder1_DtList_ctl" + str(v) + "_LblScore']/text()").extract_first()
            dic['inspection_date'] = res.xpath("//span[@id='ctl00_ContentPlaceHolder1_DtList_ctl" + str(v) + "_LblInDt']/text()").extract_first()
            dic['inspection_type'] = 'health_inspection' if dic['inspection_date'] else ''
            dic['county'] = self.county_data
            yield self.save_csv(response, dic).load_item()
        if self.county_list:
            # Advance to the next county and re-submit the same search form.
            self.county_data = self.county.pop(0)
            self.county_pop = self.county_list.pop(0)
            print("\n\n\n")
            print(self.county_data)
            print("\n\n\n")
            formdata = {
                "ctl00$ScriptManager1": "ctl00$UpdatePanel1|ctl00$ContentPlaceHolder1$BtnSearch",
                "ctl00$ContentPlaceHolder1$TxtEstdNm": "",
                "ctl00$ContentPlaceHolder1$DrpEstdType": "All",
                "ctl00$ContentPlaceHolder1$txtCity": "",
                "ctl00$ContentPlaceHolder1$DrpCnty": str(self.county_pop),
                "__LASTFOCUS": "",
                "__VIEWSTATE": response.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first(),
                "__VIEWSTATEGENERATOR": response.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
                "__EVENTTARGET": "",
                "__EVENTARGUMENT": "",
                "__EVENTVALIDATION": response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                "ctl00$ContentPlaceHolder1$BtnSearch.x": "60",
            }
            header = {
                "Accept": '*/*',
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "Host": "foodscores.state.al.us",
                "Origin": "http://foodscores.state.al.us",
                "Referer": response.url,
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
            }
            yield scrapy.FormRequest.from_response(response, callback=self.parse_next1, dont_filter=True, formdata=formdata, headers=header, dont_click=True, errback=self.errors, meta={"max_retry_times": 3})

    def save_csv(self, response, data_dic):
        """Populate an ItemLoader with the scraped fields plus the fixed
        source metadata, and return it (caller calls ``load_item()``)."""
        il = ItemLoader(item=AlFoodInspectionsSpiderItem(), response=response)
        # Normalize every value: strip, drop tags, collapse whitespace.
        il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, lambda data: re.sub(r'\s+', ' ', data) if data else '', replace_escape_chars)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'AL_Food_Inspections')
        il.add_value('url', 'http://www.alabamapublichealth.gov/foodscores/index.html')
        for k in data_dic:
            il.add_value(k, (data_dic[k]))
        return il

    def dba_format(self, name):
        """Split an establishment name into (company, dba).

        Handles 'DBA' markers enclosed in square brackets or parentheses
        specially (e.g. "ACME [DBA Joe's]"); everything else is delegated to
        :meth:`_getDBA1`, which handles the many inline DBA spellings.
        """
        if name:
            if "[" in name:
                name_replace = name
                if 'DBA' in name_replace:
                    split_name_replace = name_replace.split('[')
                    if len(split_name_replace) > 1:
                        if 'DBA' in (split_name_replace)[1]:
                            # Bracketed DBA: take the bracket contents as the
                            # dba name, minus the DBA marker and bracket.
                            b = ((split_name_replace)[1].replace('DBA', '').replace(']', '').strip())
                            return [split_name_replace[0], b]
                        else:
                            return self._getDBA1(name_replace)
                    else:
                        return self._getDBA1(' '.join(split_name_replace))
                else:
                    return self._getDBA1(name_replace)
            else:
                if "(" in name:
                    name_replace = name
                    if 'DBA' in name_replace:
                        split_name_replace = name_replace.split('(')
                        if len(split_name_replace) > 1:
                            if 'DBA' in (split_name_replace)[1]:
                                # Parenthesized DBA, same idea as above.
                                b = ((split_name_replace)[1].replace('DBA', '').replace(')', '').strip())
                                return (split_name_replace[0], b)
                            else:
                                return self._getDBA1(name_replace)
                        else:
                            return self._getDBA1(' '.join(split_name_replace))
                    else:
                        return self._getDBA1(name_replace)
                else:
                    return self._getDBA1(name)
        else:
            return self._getDBA1(name)

    def errors(self, response):
        """Errback for failed county searches: log and move on to the next
        county, re-posting from the last cached good page.

        NOTE(review): Scrapy errbacks receive a Failure object, not a
        Response — this callback only uses cached ``self.*`` state, so it
        still works, but the parameter name is misleading; verify.
        """
        print("\n\n\n")
        print('error in this page:', self.county_data)
        if self.county_list:
            self.county_data = self.county.pop(0)
            self.county_pop = self.county_list.pop(0)
            print("\n\n\n")
            print(self.county_data)
            print("\n\n\n")
            # Rebuild the postback from the cached self.response since the
            # failing request yielded no usable page.
            formdata = {
                "ctl00$ScriptManager1": "ctl00$UpdatePanel1|ctl00$ContentPlaceHolder1$BtnSearch",
                "ctl00$ContentPlaceHolder1$TxtEstdNm": "",
                "ctl00$ContentPlaceHolder1$DrpEstdType": "All",
                "ctl00$ContentPlaceHolder1$txtCity": "",
                "ctl00$ContentPlaceHolder1$DrpCnty": str(self.county_pop),
                "__LASTFOCUS": "",
                "__VIEWSTATE": self.response.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first(),
                "__VIEWSTATEGENERATOR": self.response.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
                "__EVENTTARGET": "",
                "__EVENTARGUMENT": "",
                "__EVENTVALIDATION": self.response.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                "ctl00$ContentPlaceHolder1$BtnSearch.x": "60",
            }
            header = {
                "Accept": '*/*',
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "Host": "foodscores.state.al.us",
                "Origin": "http://foodscores.state.al.us",
                "Referer": self.response.url,
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
            }
            yield scrapy.FormRequest.from_response(self.response, callback=self.parse_next1, dont_filter=True, formdata=formdata, headers=header, dont_click=True, errback=self.errors, meta={"max_retry_times": 3})

    def _getDBA1(self, person_name):
        """Normalize the many inline 'DBA' spellings to ' dba ' and split the
        name on it; returns (name, dba_name) or (original, '').

        NOTE(review): the outer regex alternation repeats 'doingbusiness'
        twice and the inner pattern contains an unescaped 'DBA.' — likely
        typos in the pattern; left as-is to preserve behavior.
        """
        if person_name:
            person_name = re.sub('doingbusiness|doingbusiness', '', re.sub(r" Dba | DBA |D/B/A|d/b/a| DBA|D B A | D B A| D B A | d b a| d b a |d b a |DBA | dba|dba |\(dba\)|dba:|\(DBA\)|D/b/a|/DBA|D/b/A|/dba|/dba/|dba/|/DBA/|DBA/|-DBA|DBA.|/DBA", ' dba ', person_name))
            if re.search(' dba ', person_name, flags=re.IGNORECASE):
                name = person_name.split('dba')[0]
                dba_name = person_name.split('dba')[1]
                return name, dba_name
        return (person_name, '')
class CtSecurityServiceLicensesSpider(CommonSpider):
    """Scrapes Connecticut security-service licenses (bail bondsmen, bail
    enforcement agents, private investigators, firearms/security instructors)
    from a set of state PDFs parsed with tabula.

    ``parse`` handles seven source files: file1/file2/file7 are fetched from
    portal.ct.gov URLs; file3-file6 are OCR-converted PDFs read from
    hard-coded local paths (see NOTE below).  Each branch uses tabula with
    hand-measured ``area``/``columns`` coordinates, then a pandas
    groupby-with-rolling-counter trick to merge multi-line PDF rows into one
    logical record.
    """
    name = 'ai_1128_ct_security_service_licenses'
    allowed_domains = ['ct.gov']
    start_urls = [
        'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
    ]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1128_Licenses_Security_Service_CT_CurationReady'),
        'JIRA_ID': 'AI_1128',
        'HTTPCACHE_ENABLED': False,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        # 'JOBDIR' : CustomSettings.getJobDirectory('ct_security_service_licenses'),
        'TOP_HEADER': {
            'bail limit': 'Bail Limit',
            'company_name': '',
            'dba_name': '',
            'location_address_string': 'Address',
            'permit_lic_desc': '',
            'permit_lic_exp_date': 'Exp Date',
            'permit_lic_no': 'Lic No.',
            'permit_lic_status': 'Status',
            'permit_subtype': 'Type',
            'permit_type': '',
            'person_name': 'Agent/Instructors',
            'person_phone': 'Phone',
            'person_subtype': ''
        },
        'FIELDS_TO_EXPORT': [
            'permit_subtype', 'company_name', 'person_name', 'dba_name',
            'person_subtype', 'permit_lic_no', 'bail limit',
            'permit_lic_status', 'permit_lic_exp_date',
            'location_address_string', 'person_phone', 'permit_lic_desc',
            'permit_type', 'url', 'sourceName', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': ['bail limit']
    }

    def _getPersonName(self, mixed_name):
        """Convert "Last, First" into "First Last"; None becomes ''."""
        if mixed_name != None and ',' in mixed_name:
            diff = mixed_name.split(',')
            fir_name, sec_name = diff[0], diff[1]
            mixed_name = ' '.join([sec_name, fir_name])
        elif mixed_name == None:
            mixed_name = ''
        return mixed_name.strip()

    def parse(self, response):
        """Extract license records from all seven PDF sources and yield one
        item per record.  Each ``if fileN:`` branch is independent."""
        file1 = 'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
        file2 = 'https://portal.ct.gov/-/media/DESPP/SLFU/bea/BEALIST82017pdf.pdf?la=en'
        file7 = 'https://portal.ct.gov/-/media/DESPP/SLFU/bondsman/BondsmanFirearmInstructorspdf.pdf?la=en'
        # ------->OCR format convert and give file path
        # NOTE(review): file3-file6 are absolute paths on a developer
        # machine; the spider breaks anywhere else — confirm deployment.
        file3 = '/home/ait-python/Downloads/pdf (2)/Licensed-Private-Detectives-and-Security-Companies.pdf'
        file4 = '/home/ait-python/Downloads/pdf (2)/bluecardinstructorpdf.pdf'
        file5 = '/home/ait-python/Downloads/pdf (2)/beainstructor032018pdf.pdf'
        file6 = '/home/ait-python/Downloads/pdf (2)/Public-Security-Instructors.pdf'
        # --------->
        # --- file1: Bail Bondsmen (single page, fixed column x-positions) ---
        if file1:
            df = tabula.read_pdf(
                file1,
                pages='1',
                area=[110.635, 11.385, 595.485, 768.735],
                columns=[
                    170.775, 218.295, 279.675, 329.175, 395.505, 526.185,
                    636.075, 668.745, 700.355, 764.775
                ],
                silent=True,
                # stream=True,
                # multiple_tables=True,
                guess=False,
                encoding='utf-8',
                pandas_option={'header': None})
            for _, row in df.fillna('').iterrows():
                row = row.tolist()
                # Column 0 is "Last, First"; swap to "First Last".
                old_name = str(row[0]).split(',')
                name = old_name[1] + ' ' + old_name[0]
                print(name)
                permit_lic_no = row[1]
                bail_limit = str(row[2])
                permit_lic_status = row[3]
                permit_lic_exp_date = row[4]
                address = str(row[5])
                city = str(row[6])
                state = str(row[7])
                zippp = str(row[8])
                print("===============", zippp)
                location_address_string = self.format__address_4(
                    address, city, state, zippp)
                # Strip the float artifact tabula adds to numeric phones.
                phone = str(row[9]).replace('.0', '')
                permit_subtype = 'Bail Bondsman'
                company_name = name
                il = ItemLoader(item=CtSecurityServiceLicensesSpiderItem())
                il.add_value(
                    'url',
                    'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
                )
                il.add_value('sourceName', 'CT_Security_Service_Licenses')
                il.add_value('permit_subtype', permit_subtype)
                il.add_value('company_name', name)
                il.add_value('person_name', name)
                il.add_value('dba_name', '')
                il.add_value('person_subtype', 'Agent')
                il.add_value('permit_lic_no', permit_lic_no)
                il.add_value('bail limit', bail_limit)
                il.add_value('permit_lic_status', permit_lic_status)
                il.add_value('permit_lic_exp_date', permit_lic_exp_date)
                il.add_value('location_address_string', location_address_string)
                il.add_value('person_phone', phone)
                il.add_value('permit_lic_desc', permit_subtype)
                il.add_value('permit_type', 'security_license')
                yield il.load_item()
        # --- file2: Bail Enforcement Agents ---
        if file2:
            df = tabula.read_pdf(
                file2,
                pages='all',
                area=[30.175, 42.57, 584.595, 773.19],
                columns=[
                    182.16, 233.64, 295.02, 369.27, 487.08, 600.93, 642.51,
                    681.12, 757.35
                ],
                # spreadsheet=True,
                silent=True,
                stream=True,
                # multiple_tables=True,
                # guess = True,
                encoding='ISO-8859-1',
                pandas_option={'header': None})
            for _, row in df.fillna('').iterrows():
                row = row.tolist()
                old_name = str(row[0]).split(',')
                name = old_name[1] + ' ' + old_name[0]
                # print(old_name)
                permit_lic_no = str(row[1])
                permit_lic_status = str(row[2])
                permit_lic_exp_date = str(row[3])
                address = str(row[4])
                city = str(row[5])
                state = str(row[6])
                zippp = str(row[7])
                phone = str(row[8]).replace('.0', '')
                location_address_string = self.format__address_4(
                    address, city, state, zippp)
                permit_subtype = 'Bail enforcement Agents'
                il = ItemLoader(item=CtSecurityServiceLicensesSpiderItem())
                il.add_value(
                    'url',
                    'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
                )
                il.add_value('sourceName', 'CT_Security_Service_Licenses')
                il.add_value('permit_subtype', permit_subtype)
                il.add_value('company_name', name)
                il.add_value('person_name', name)
                il.add_value('dba_name', '')
                il.add_value('person_subtype', 'Agent')
                il.add_value('permit_lic_no', permit_lic_no)
                il.add_value('bail limit', '')
                il.add_value('permit_lic_status', permit_lic_status)
                il.add_value('permit_lic_exp_date', permit_lic_exp_date)
                il.add_value('location_address_string', location_address_string)
                il.add_value('person_phone', phone)
                il.add_value('permit_lic_desc', 'Bail enforcement Agents')
                il.add_value('permit_type', 'security_license')
                yield il.load_item()
        # --- file3: Private Investigators / Security Companies (local OCR PDF) ---
        if file3:
            def __extractData(self, response):
                """Generator: read file3 with tabula and merge continuation
                lines into single records; yields one list-of-dicts."""
                def rolling_group(val):
                    # Each non-null 'status' cell starts a new record; rows
                    # between carry wrapped text from the previous record.
                    if pd.notnull(val):
                        # if pd.notnull(val) and '/' in val and not 'st' in val:
                        rolling_group.group += 1
                    return rolling_group.group

                rolling_group.group = 0

                def joinFunc(g, column):
                    # Join a group's cells with '/', then undo the joiner
                    # after '&'/'-' and collapse doubled joiners.
                    col = g[column]
                    joiner = "/"
                    s = joiner.join(
                        [str(each) for each in col if pd.notnull(each)])
                    s = re.sub("(?<=&)" + joiner, " ", s)
                    s = re.sub("(?<=-)" + joiner, " ", s)
                    s = re.sub(joiner * 2, joiner, s)
                    return s

                def getDf(file3, area):
                    return tabula.read_pdf(
                        file3,
                        pages='all',
                        area=[45.045, 29.7, 582.615, 781.11],
                        columns=[
                            217.8, 268.29, 326.7, 375.16, 532.62, 617.76,
                            670.23, 699.93, 779.13
                        ],
                        guess=False,
                        encoding='ISO-8859-1',
                        pandas_option={'header': None})

                df = getDf(file3, [45.045, 29.7, 582.615, 781.11])
                df.columns = [
                    'company', 'lic_no', 'status', 'exp_date', 'street',
                    'city', 'state', 'zip', 'phone'
                ]
                groups = df.groupby(df['status'].apply(rolling_group),
                                    as_index=False)
                groupFunct = lambda g: pd.Series(
                    [joinFunc(g, col) for col in g.columns], index=g.columns)
                final_df = groups.apply(groupFunct).fillna('')
                yield final_df.to_dict('records')

            for col in __extractData(self, response):
                # print(col)
                for row in col:
                    company_name = str(row['company'])
                    permit_lic_no = row['lic_no']
                    permit_lic_status = row['status']
                    permit_lic_exp_date = row['exp_date']
                    address = str(row['street'])
                    city = str(row['city'])
                    state = str(row['state'])
                    zippp = str(row['zip'])
                    location_address_string = self.format__address_4(
                        address, city, state, zippp)
                    permit_subtype = 'Private Investigators/Security Companies'
                    phone = str(row['phone']).replace('.0', '')
                    il = ItemLoader(item=CtSecurityServiceLicensesSpiderItem())
                    il.add_value(
                        'url',
                        'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
                    )
                    il.add_value('sourceName', 'CT_Security_Service_Licenses')
                    il.add_value('permit_subtype', permit_subtype)
                    il.add_value('company_name', self._getDBA(company_name)[0])
                    il.add_value('person_name', '')
                    il.add_value('dba_name', self._getDBA(company_name)[1])
                    il.add_value('person_subtype', '')
                    il.add_value('permit_lic_no', permit_lic_no)
                    il.add_value('bail limit', '')
                    il.add_value('permit_lic_status', permit_lic_status)
                    il.add_value('permit_lic_exp_date', permit_lic_exp_date)
                    il.add_value('location_address_string',
                                 location_address_string)
                    il.add_value('person_phone', phone)
                    il.add_value('permit_lic_desc', permit_subtype)
                    il.add_value('permit_type', 'security_license')
                    yield il.load_item()
        # --- file4: Bail Firearms instructors (blue card, local OCR PDF) ---
        if file4:
            def __extractData(self, pdflink):
                """Read file4 and merge wrapped rows; returns list of dicts
                with columns a=name, b=company/address, c=phone."""
                def rolling_group(val):
                    if pd.notnull(val):
                        rolling_group.group += 1
                    return rolling_group.group

                rolling_group.group = 0

                def joinFunc(g, column):
                    col = g[column]
                    joiner = "/"
                    s = joiner.join(
                        [str(each) for each in col if pd.notnull(each)])
                    s = re.sub("(?<=-)" + joiner, " ", s)
                    s = re.sub(joiner * 2, joiner, s)
                    return s

                def getDF(area, column):
                    df = tabula.read_pdf(
                        file4,
                        pages='all',
                        silent=True,
                        guess=False,
                        columns=column,
                        area=area,
                        encoding='ISO-8859-1',
                        pandas_options={
                            'header': None,
                            'error_bad_lines': False,
                            'warn_bad_lines': False
                        }).replace('\r', ' ', regex=True).dropna(how='all')
                    return df

                df = getDF([70.763, 34.425, 756.203, 589.815],
                           [212.67, 470.475, 589.815])
                df.columns = ['a', 'b', 'c']
                groups = df.groupby(df['c'].apply(rolling_group),
                                    as_index=False)
                groupFunct = lambda g: pd.Series(
                    [joinFunc(g, col) for col in g.columns], index=g.columns)
                final_df = groups.apply(groupFunct).fillna('')
                return final_df.to_dict('records')

            for col in __extractData(self, file4):
                mixed_name = self._getPersonName(col['a'])
                # 'b' packs company + two address lines joined by '/'.
                company = col['b'].split('/')
                # NOTE(review): original indentation was lost; records whose
                # 'b' field has fewer than three '/'-parts are skipped here —
                # confirm against the original source.
                if len(company) > 2:
                    company_add1 = company[1]
                    company_add2 = company[2]
                    location_address_string = company_add1 + ', ' + company_add2
                    c_name = company[0]
                    print("==============", c_name)
                    phone = col['c']
                    permit_subtype = 'Bail Firearms instructors'
                    il = ItemLoader(item=CtSecurityServiceLicensesSpiderItem())
                    il.add_value(
                        'url',
                        'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
                    )
                    il.add_value('sourceName', 'CT_Security_Service_Licenses')
                    il.add_value('permit_subtype', permit_subtype)
                    il.add_value('company_name', c_name)
                    il.add_value('person_name', mixed_name)
                    il.add_value('dba_name', '')
                    il.add_value('person_subtype', '')
                    il.add_value('permit_lic_no', '')
                    il.add_value('bail limit', '')
                    il.add_value('permit_lic_status', '')
                    il.add_value('permit_lic_exp_date', '')
                    il.add_value('location_address_string',
                                 location_address_string)
                    il.add_value('person_phone', phone)
                    il.add_value('permit_lic_desc', permit_subtype)
                    il.add_value('permit_type', 'security_license')
                    yield il.load_item()
        # --- file5: Bail enforcement instructors (local OCR PDF) ---
        if file5:
            def __extractData(pdflink):
                """Read file5 and merge wrapped rows; columns are a=name,
                b=(unused), c=mail/phone, d=company+address."""
                def rolling_group(val):
                    if pd.notnull(val):
                        rolling_group.group += 1
                    return rolling_group.group

                rolling_group.group = 0

                def joinFunc(g, column):
                    col = g[column]
                    joiner = "/"
                    s = joiner.join(
                        [str(each) for each in col if pd.notnull(each)])
                    s = re.sub("(?<=-)" + joiner, " ", s)
                    s = re.sub(joiner * 2, joiner, s)
                    return s

                def getDF(area, column):
                    df = tabula.read_pdf(
                        file5,
                        pages='all',
                        silent=True,
                        guess=False,
                        columns=column,
                        area=area,
                        encoding='ISO-8859-1',
                        pandas_options={
                            'header': 'infer',
                            'error_bad_lines': False,
                            'warn_bad_lines': False
                        }).replace('\r', ' ', regex=True).dropna(how='all')
                    return df

                df = getDF([70.785, 23.76, 581.625, 713.79],
                           [183.15, 269.28, 446.49, 672.21])
                df.columns = ['a', 'b', 'c', 'd']
                groups = df.groupby(df['a'].apply(rolling_group),
                                    as_index=False)
                groupFunct = lambda g: pd.Series(
                    [joinFunc(g, col) for col in g.columns], index=g.columns)
                final_df = groups.apply(groupFunct).fillna('')
                return final_df.to_dict('records')

            for col in __extractData(file5):
                person_name = col['a']
                # 'c' may be "email/phone" or just "phone".
                mail = col['c'].split('/')
                if len(mail) == 1:
                    phone = mail[0]
                else:
                    phone = mail[1]
                add = col['d'].split('/')
                company_name = add[0]
                location_address_string = add[1] + ', ' + add[2]
                permit_subtype = 'Bail enforcement instructors'
                il = ItemLoader(item=CtSecurityServiceLicensesSpiderItem())
                il.add_value(
                    'url',
                    'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
                )
                il.add_value('sourceName', 'CT_Security_Service_Licenses')
                il.add_value('permit_subtype', permit_subtype)
                il.add_value('company_name', company_name)
                il.add_value('person_name', person_name)
                il.add_value('dba_name', '')
                il.add_value('person_subtype', 'Instructors')
                il.add_value('permit_lic_no', '')
                il.add_value('bail limit', '')
                il.add_value('permit_lic_status', '')
                il.add_value('permit_lic_exp_date', '')
                il.add_value('location_address_string', location_address_string)
                il.add_value('person_phone', phone)
                il.add_value('permit_lic_desc', permit_subtype)
                il.add_value('permit_type', 'security_license')
                yield il.load_item()
        # --- file6: Public Security Instructors (local OCR PDF) ---
        if file6:
            def __extractData(pdflink):
                """Read file6 and merge wrapped rows; 8 columns a..h with
                a=name+lic_no, f=status, g=address, h=phone."""
                def rolling_group(val):
                    if pd.notnull(val):
                        rolling_group.group += 1
                    return rolling_group.group

                rolling_group.group = 0

                def joinFunc(g, column):
                    col = g[column]
                    joiner = "/"
                    s = joiner.join(
                        [str(each) for each in col if pd.notnull(each)])
                    s = re.sub("(?<=-)" + joiner, " ", s)
                    s = re.sub(joiner * 2, joiner, s)
                    return s

                def getDF(area, column):
                    df = tabula.read_pdf(
                        file6,
                        pages='all',
                        silent=True,
                        guess=False,
                        columns=column,
                        area=area,
                        encoding='ISO-8859-1',
                        pandas_options={
                            'header': 'infer',
                            'error_bad_lines': False,
                            'warn_bad_lines': False
                        }).replace('\r', ' ', regex=True).dropna(how='all')
                    return df

                df = getDF([73.755, 42.57, 572.715, 770.22], [
                    156.42, 218.79, 253.44, 306.9, 398.97, 489.06, 682.11,
                    765.27
                ])
                df.columns = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
                groups = df.groupby(df['h'].apply(rolling_group),
                                    as_index=False)
                groupFunct = lambda g: pd.Series(
                    [joinFunc(g, col) for col in g.columns], index=g.columns)
                final_df = groups.apply(groupFunct).fillna('')
                return final_df.to_dict('records')

            for col in __extractData(file6):
                # 'a' is "Last, First/NN-NNNN" (name + license number).
                person_name = col['a'].split('/')
                f_name = person_name[0].split(',')
                permit_lic_no = person_name[1]
                new_lic_no = re.search(r'\d\d-\d*', permit_lic_no)
                lic_no = ''
                if new_lic_no:
                    lic_no = new_lic_no.group()
                if len(f_name) == 1:
                    name = f_name[0]
                else:
                    name = f_name[1] + ' ' + f_name[0]
                # NOTE(review): replace('0','') strips ALL zeros from the
                # status text, not just a padding character — verify intent.
                permit_lic_status = col['f'].split('/')[0].replace('0', '')
                location = col['g'].split('/')
                location_address_string = location[0] + ', ' + location[
                    1] + ' ' + location[2]
                phone = col['h']
                permit_subtype = 'Public Security Instructors'
                il = ItemLoader(item=CtSecurityServiceLicensesSpiderItem())
                il.add_value(
                    'url',
                    'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
                )
                il.add_value('sourceName', 'CT_Security_Service_Licenses')
                il.add_value('permit_subtype', permit_subtype)
                il.add_value('company_name', name)
                il.add_value('person_name', name)
                il.add_value('dba_name', '')
                il.add_value('person_subtype', 'Instructors')
                il.add_value('permit_lic_no', lic_no)
                il.add_value('bail limit', '')
                il.add_value('permit_lic_status', permit_lic_status)
                il.add_value('permit_lic_exp_date', '')
                il.add_value('location_address_string', location_address_string)
                il.add_value('person_phone', phone)
                il.add_value('permit_lic_desc', permit_subtype)
                il.add_value('permit_type', 'security_license')
                yield il.load_item()
        # --- file7: Bondsman Firearms Instructors (two-column layout) ---
        if file7:
            self.check_val = False

            def _extractpdf(self, response):
                """Generator: records in file7 are delimited by a trailing
                phone line — a row matching (NNN) NNN-NNNN arms
                ``check_val`` and the next row starts a new group.  Each
                group is reshaped into name/company/phone/location columns.
                """
                def rolling_group(val):
                    match = re.search(r'(\([\d]{3}\))[\s][\d]{3}-[\d]{4}', val)
                    if pd.notnull(val) and match:
                        self.check_val = True
                    elif self.check_val:
                        rolling_group.group += 1
                        self.check_val = False
                    return rolling_group.group

                rolling_group.group = 0
                df = tabula.read_pdf(file7,
                                     pages='all',
                                     guess=False,
                                     columns=[263.16, 573.75],
                                     encoding='ISO-8859-1',
                                     pandas_options={
                                         'header': None
                                     }).fillna('')
                # Drop the first 11 rows (page header/preamble).
                df = df.drop([i for i in range(11)])
                # Flatten the two page columns into one series of lines.
                Series = df[0].append(df[1]).reset_index(drop=True)
                df = Series.to_frame(name=None)
                groups = df.groupby(df[0].apply(rolling_group), as_index=False)
                for i in groups:
                    x = pd.DataFrame(i[1]).reset_index(drop=True)
                    if (x.apply(len).values[0]) > 1:
                        x.replace('', np.nan, inplace=True)
                        df1 = x.apply(lambda x: pd.Series(x.dropna().values))
                        # Keep non-numeric lines as candidate address parts.
                        df1[0] = df1.apply(
                            lambda x: x[0]
                            if not x[0].isdigit() and len(x) != 0 else np.nan,
                            axis=1)
                        df1[1] = x[0][0]
                        df1[3] = x[0][1]
                        df1[0] = df1.apply(lambda x: x[0]
                                           if not x[0] == x[1] else np.nan,
                                           axis=1)
                        # dba name
                        df1[2] = df1.apply(
                            lambda x: x[0]
                            if str(x[0]).startswith('(') else np.nan,
                            axis=1)
                        df1[0] = df1.apply(lambda x: x[0]
                                           if x[0] != x[2] else np.nan,
                                           axis=1)
                        df1[0] = df1.apply(lambda x: x[0]
                                           if x[0] != x[3] else np.nan,
                                           axis=1)
                        # Remaining lines joined into the location string.
                        df1[4] = df1[0].str.cat(sep=', ')
                        df1 = df1.drop([0], axis=1)
                        df1[2].fillna(method='bfill', inplace=True)
                        df1.drop_duplicates(subset=1, inplace=True)
                        df1 = df1.fillna('')
                        df1.columns = [
                            'person_name', 'company_name', 'company_phone',
                            'location'
                        ]
                        final_df = df1.to_dict('records')
                        # print(final_df)
                        yield final_df

            for col in _extractpdf(self, response):
                # print(col)
                for row in col:
                    print('________________', row)
                    permit_subtype = 'Bail Firearms instructors'
                    il = ItemLoader(item=CtSecurityServiceLicensesSpiderItem())
                    il.add_value(
                        'url',
                        'https://portal.ct.gov/-/media/DESPP/files/bondsmanpdf.pdf?la=en'
                    )
                    il.add_value('sourceName', 'CT_Security_Service_Licenses')
                    il.add_value('permit_subtype', permit_subtype)
                    il.add_value('company_name', row['company_name'])
                    il.add_value('person_name', row['person_name'])
                    il.add_value('dba_name', '')
                    il.add_value('person_subtype', 'Instructors')
                    il.add_value('permit_lic_no', '')
                    il.add_value('bail limit', '')
                    il.add_value('permit_lic_status', '')
                    il.add_value('permit_lic_exp_date', '')
                    il.add_value('location_address_string', row['location'])
                    il.add_value('person_phone', row['company_phone'])
                    il.add_value('permit_lic_desc', permit_subtype)
                    il.add_value('permit_type', 'security_license')
                    yield il.load_item()
class IlSwimmingFacilityLicensesSpider(CommonSpider):
    """Scrape swimming-facility license records from the Illinois DPH public
    verification portal (an ASP.NET WebForms search page).

    Flow: ``parse`` collects the county ``<option>`` codes and POSTs a search
    for the county at the head of the queue; ``second_page_crawl`` walks the
    result table, follows the ``__doPostBack`` pagination link, and when the
    county is exhausted re-enters ``parse`` for the next one.
    """

    name = 'il_swimming_facility_licenses'
    allowed_domains = ['illinois.gov']
    start_urls = [
        'http://ehlicv5pub.illinois.gov/Clients/ILDOHENV/PUBLIC/Swimming_Verifications.aspx'
    ]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1401_Licenses_Swimming_Facility_IL_CurationReady'),
        'JIRA_ID': 'AI_1401',
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        'DOWNLOAD_DELAY': 0.5,
        'TOP_HEADER': {
            'aquatic features type': 'Aquatic Features Type',
            'company_name': 'Facility Name',
            'county': 'County',
            'dba_name': '',
            'location': 'Location',
            'location_address_string': 'Facility Address',
            'permit_lic_desc': '',
            'permit_lic_status': 'License Status',
            'permit_type': ''
        },
        'FIELDS_TO_EXPORT': [
            'company_name', 'dba_name', 'location_address_string', 'county',
            'aquatic features type', 'location', 'permit_lic_status',
            'permit_lic_desc', 'permit_type', 'sourceName', 'url',
            'ingestion_timestamp'
        ],
        'NULL_HEADERS': ['location', 'county', 'aquatic features type']
    }
    # Mutable state shared across callbacks (class attributes, matching the
    # convention used by the other spiders in this file).
    aqua_1_lis = []
    aqua_2_lis = []
    check = True
    key_val = ''
    searchkeys = []

    def parse(self, response):
        """Collect county codes from the search form, then POST a search for
        the county at the head of ``self.searchkeys``."""
        options = response.xpath('//*[@id="qCounty"]/option')[1:]
        for i in options:
            option = i.xpath('@value').extract_first()
            self.searchkeys.append(option)
        if self.check:
            self.key_val = self.searchkeys.pop(0)
        form_data = {
            '__VIEWSTATE': response.xpath(
                '//*[@id="__VIEWSTATE"]/@value').extract_first(),
            '__VIEWSTATEGENERATOR': 'A31F86B4',
            '__EVENTVALIDATION': response.xpath(
                '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
            'ObjectTypeID': '2177',
            'ObjectID': '40',
            'qBusinessName': '',
            'qStreet': '',
            'qCity': '',
            'qCounty': str(self.key_val),
            'btnSearch': 'Search'
        }
        # NOTE(review): ``option`` is the last county seen in the loop above;
        # if the page had no county options this would raise NameError.
        yield scrapy.FormRequest(url=self.start_urls[0],
                                 method='POST',
                                 callback=self.second_page_crawl,
                                 formdata=form_data,
                                 dont_filter=True,
                                 meta={'option': option})

    def second_page_crawl(self, response):
        """Parse one page of search results and emit an item per facility.

        The aquatic-features cell uses ``<br>`` separators, so the raw HTML is
        re-parsed with BeautifulSoup after turning ``<br>`` into ``**``.
        """
        c = str(response.text).replace('<br>', '**').replace('</br>', '**')
        page_content_b = soup(c, "lxml")
        table = page_content_b.find('table', id="dtgList")
        soup_row = table.findAll('tr')
        check = 0
        meta = response.meta
        # Skip the header row and the two pager rows at the bottom.
        common_path = response.xpath('//*[@id="dtgList"]//tr')[1:-2]
        for ind_val, iter_val in enumerate(common_path):
            facility_name = iter_val.xpath('td[1]/text()').extract_first()
            facility_name_1 = self._getDBA(facility_name)[0]
            dba_name = self._getDBA(facility_name)[1]
            desc = 'Swimming Facility License for ' + facility_name_1
            mail_address_string = iter_val.xpath('td[2]/text()').extract()
            mail_add_lis = []
            county = ''
            check = 3
            # Initialise so an empty address cell cannot leave ``adres``
            # unbound when data_pass is built below (was a NameError risk).
            adres = ''
            for mail_ind, mail_add_value in enumerate(mail_address_string):
                # (The original ``if mail_ind >= 0`` wrapper was removed:
                # enumerate indices are never negative, so it was dead code.)
                if check == 3:
                    mail_add_lis.append(mail_add_value)
                    mail_add_lis_1 = ', '.join(mail_add_lis).replace('  ', ' ')
                    # The county is the last whitespace-separated token; the
                    # address is everything before it (minus a trailing comma).
                    address = mail_add_lis_1[:mail_add_lis_1.rindex(
                        ", ")] if mail_add_lis_1[:mail_add_lis_1.rindex(
                            " ")].endswith(
                                ',') else mail_add_lis_1[:mail_add_lis_1.
                                                         rindex(" ")]
                    county = mail_add_lis_1[mail_add_lis_1.rindex(" ") + 1:]
                    # NOTE(review): inserts a 'KY' state token into an
                    # *Illinois* address — looks copy-pasted from a Kentucky
                    # spider; confirm whether this should be 'IL'.
                    adres = re.sub(r'(\d+)$', r'KY \1', address)
                else:
                    county = mail_add_value
            aqua_1_lis = []
            aqua_2_lis = []
            aq_location = ''
            aq_feature = ''
            aq_feat = ''
            aq_loct = ''
            # td[3] (index 2) holds 'feature-location' pairs split on the
            # '**' markers substituted for <br> above.
            cols = soup_row[ind_val + 1].findAll('td')
            if cols[2]:
                ere = str(cols[2]).split('**')
                for kre in ere:
                    clean_val = self.data_clean(kre)
                    if clean_val and len(clean_val) > 1:
                        aq_feature = clean_val.split('-')[0]
                        aqua_1_lis.append(aq_feature)
                        # De-duplicate while preserving order.
                        aqua_1_lis = list(dict.fromkeys(aqua_1_lis))
                        aq_feat = '; '.join(aqua_1_lis)
                        if aq_feat and len(aq_feat) > 2:
                            if aq_feat[0] == ';':
                                aq_feat = aq_feat.replace(aq_feat[0], '')
                        aq_location = clean_val.split('-')[1]
                        aqua_2_lis.append(aq_location)
                        aqua_2_lis = list(dict.fromkeys(aqua_2_lis))
                        aq_loct = '; '.join(aqua_2_lis)
                        if aq_loct and len(aq_loct) > 2:
                            if aq_loct[0] == ';':
                                aq_loct = aq_loct.replace(aq_loct[0], '')
            status = iter_val.xpath('td[4]/text()').extract_first()
            data_pass = {
                'location': aq_loct,
                'permit_lic_desc': desc,
                'company_name': facility_name_1,
                'county': county,
                'dba_name': dba_name,
                'location_address_string': adres,
                'permit_type': 'pool_license',
                'permit_lic_status': status,
                'aquatic_features_type': aq_feat
            }
            yield self.save_to_csv(response, **data_pass)
        # Follow the __doPostBack pagination link; otherwise restart parse()
        # so the next county in the queue is searched.
        next_page_link = response.xpath(
            "//td[@colspan='4']/span/following::a/@href").extract_first()
        if next_page_link and 'doPostBack(' in next_page_link:
            next_page = next_page_link.split("('")[1].split("',")[0]
            form_data_page = {
                '__EVENTTARGET': next_page,
                '__EVENTARGUMENT': response.xpath(
                    '//*[@id="__EVENTARGUMENT"]/@value').extract_first(),
                '__VIEWSTATE': response.xpath(
                    '//*[@id="__VIEWSTATE"]/@value').extract_first(),
                '__VIEWSTATEGENERATOR': 'A31F86B4',
                '__EVENTVALIDATION': response.xpath(
                    '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                'ObjectTypeID': '2177',
                'ObjectID': '40',
                'qCounty': '5451'
            }
            yield scrapy.FormRequest(
                url=
                "http://ehlicv5pub.illinois.gov/Clients/ILDOHENV/PUBLIC/Swimming_Verifications.aspx",
                callback=self.second_page_crawl,
                dont_filter=True,
                formdata=form_data_page,
                meta=meta)
        else:
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse,
                                 dont_filter=True)

    def save_to_csv(self, response, **datum):
        """Load one facility record into the item; ``datum`` carries the
        fields assembled by second_page_crawl.

        NOTE(review): 'ingestion_timestamp' is exported but never added here —
        presumably filled by a pipeline; verify.
        """
        il = ItemLoader(item=IlSwimmingFacilityLicensesSpiderItem(),
                        response=response)
        il.default_input_processor = MapCompose(lambda v: v.strip(),
                                                remove_tags,
                                                replace_escape_chars)
        il.add_value(
            'url',
            'http://ehlicv5pub.illinois.gov/Clients/ILDOHENV/PUBLIC/Swimming_Verifications.aspx'
        )
        il.add_value('sourceName', 'IL_Swimming_Facility_Licenses')
        il.add_value('location', datum['location'])
        il.add_value('permit_lic_desc', datum['permit_lic_desc'])
        il.add_value('company_name', datum['company_name'])
        il.add_value('county', datum['county'])
        il.add_value('dba_name', datum['dba_name'])
        il.add_value('location_address_string',
                     datum['location_address_string'])
        il.add_value('permit_type', datum['permit_type'])
        il.add_value('permit_lic_status', datum['permit_lic_status'])
        il.add_value('aquatic features type', datum['aquatic_features_type'])
        return il.load_item()

    def data_clean(self, value):
        """Strip HTML tags, collapse whitespace and drop phone-label prefixes
        from a raw cell fragment; return '' on empty input or any failure."""
        if value:
            try:
                clean_tags = re.compile('<.*?>')
                desc_list = re.sub(r'\s+', ' ', re.sub(clean_tags, '', value))
                # Decode the '&' entity; the previous revision's
                # replace('&', '&') was a no-op (mangled '&amp;').
                desc_list_rep = desc_list.replace('&amp;', '&').replace(
                    'Home Phone:', '').replace('Mobile Phone:',
                                               '').replace('Fax:', '').replace(
                                                   'Primary Phone:', '')
                return desc_list_rep.strip()
            except Exception:  # narrowed from bare except; still best-effort
                return ''
        else:
            return ''
class AlGeologyLicensesSpider(CommonSpider):
    """Scrape licensed-geologist records from the Alabama Board of Licensure
    search page (ASP.NET), fetching each row's detail page inline and
    following the numbered ``__doPostBack`` pagination links."""

    name = '1481_al_geology_licenses'
    allowed_domains = ['alabama.gov']
    start_urls = ['http://www.algeobd.alabama.gov/search.aspx?sm=d_a']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1481_Licenses_Geology_AL_CurationReady'),
        'JIRA_ID': 'AI_1481',
        'DOWNLOAD_DELAY': .2,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('al_geology_licenses'),
        'TOP_HEADER': {
            'company_name': 'Company',
            'company_phone': 'Phone',
            'dba_name': '',
            'location_address_string': 'Address',
            'permit_lic_desc': '',
            'permit_lic_eff_date': 'Effective Date',
            'permit_lic_exp_date': 'Expiration Date',
            'permit_lic_no': 'License Number',
            'permit_type': ''
        },
        'FIELDS_TO_EXPORT': [
            'company_name', 'dba_name', 'company_phone',
            'location_address_string', 'permit_lic_no', 'permit_lic_eff_date',
            'permit_lic_exp_date', 'permit_lic_desc', 'permit_type', 'url',
            'sourceName', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': []
    }

    def parse(self, response):
        """Kick off the paginated search at page 1."""
        yield scrapy.Request(
            url='http://www.algeobd.alabama.gov/search.aspx?sm=d_a',
            callback=self.parse_next,
            dont_filter=True,
            meta={'page': 1})

    @inline_requests
    def parse_next(self, response):
        """Walk one result page, fetch every licensee's detail page
        synchronously (via ``inline_requests``), emit one item per licensee,
        then follow the next numbered pagination link if present."""
        check_None = lambda data: data if data else ""
        # Skip the header row and the two pager rows at the bottom.
        table1 = response.xpath(
            "//div[@id='ContentPlaceHolder1_Panel1']/fieldset//tr")[1:-2]
        for i in table1:
            License = i.xpath(".//td[1]/text()").extract_first()
            # Guard the None case: extract_first() may return None on an
            # empty cell, on which .strip() would raise AttributeError.
            if License and License.strip():
                first_name = check_None(
                    i.xpath(".//td[2]/text()").extract_first())
                last_name = check_None(
                    i.xpath(".//td[3]/text()").extract_first())
                Person_Name = first_name + ' ' + last_name
                link = i.xpath(".//td[7]/a/@href").extract_first()
                link_detail = yield scrapy.Request(
                    url="http://www.algeobd.alabama.gov/" + link,
                    dont_filter=True)
                company_name = check_None(
                    link_detail.xpath(
                        "//td//b[contains(text(),'Company:')]/ancestor::td/following-sibling::td/text()"
                    ).extract_first())
                company_phone = check_None(
                    link_detail.xpath(
                        "//td//b[contains(text(),'Phone:')]/ancestor::td/following-sibling::td/text()"
                    ).extract_first())
                Address = check_None(
                    link_detail.xpath(
                        "//td//b[contains(text(),'Address:')]/ancestor::td/following-sibling::td/text()"
                    ).extract_first())
                License_Number = check_None(
                    link_detail.xpath(
                        "//td//b[contains(text(),'License Number:')]/ancestor::td/following-sibling::td/text()"
                    ).extract_first())
                Effective_Date = check_None(
                    link_detail.xpath(
                        "//td//b[contains(text(),'Effective Date:')]/ancestor::td/following-sibling::td/text()"
                    ).extract_first())
                Expiration_Date = check_None(
                    link_detail.xpath(
                        "//td//b[contains(text(),'Expiration Date:')]/ancestor::td/following-sibling::td/text()"
                    ).extract_first())
                # Retired licensees have no employer; fall back to the person.
                if "Retired" in company_name:
                    Company_name = Person_Name
                else:
                    Company_name = company_name
                il = ItemLoader(item=AlGeologyLicensesSpiderItem(),
                                response=response)
                il.default_input_processor = MapCompose(
                    lambda v: v.strip(), remove_tags, replace_escape_chars)
                il.add_value('ingestion_timestamp',
                             Utils.getingestion_timestamp())
                il.add_value('sourceName', 'AL_Geology_Licenses')
                il.add_value('url',
                             'http://www.algeobd.alabama.gov/search.aspx?sm=d_a')
                il.add_value('company_name', Company_name)
                il.add_value('dba_name', '')
                il.add_value('company_phone', company_phone)
                il.add_value('location_address_string',
                             Address if Address else "AL")
                il.add_value('permit_lic_no', License_Number)
                il.add_value('permit_lic_eff_date', Effective_Date)
                il.add_value('permit_lic_exp_date', Expiration_Date)
                il.add_value(
                    'permit_lic_desc', 'Geology License for ' +
                    Company_name if company_name else 'Geology License')
                il.add_value('permit_type', 'geology_license')
                yield il.load_item()
        # The pager renders the current page as a <span>; the link right
        # after it is the next page's __doPostBack href.
        page = response.xpath(
            "//td/span[text()='{}']/ancestor::td/following-sibling::td/a/@href"
            .format(str(response.meta['page']))).extract_first()
        current_page = response.meta['page'] + 1
        if page:
            form_args_pagn = JavaScriptUtils.getValuesFromdoPost(page)
            form_data = {
                '__EVENTTARGET': form_args_pagn['__EVENTTARGET'],
                '__EVENTARGUMENT': form_args_pagn['__EVENTARGUMENT'],
                '__VIEWSTATE': response.xpath(
                    '//*[@id="__VIEWSTATE"]/@value').extract_first(),
                '__VIEWSTATEGENERATOR': response.xpath(
                    '//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
                '__EVENTVALIDATION': response.xpath(
                    '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                'ctl00$ContentPlaceHolder1$NumberTextBox': '',
                'ctl00$ContentPlaceHolder1$NameTextBox': '',
                'ctl00$ContentPlaceHolder1$CityTextBox': ''
            }
            yield scrapy.FormRequest(
                url='http://www.algeobd.alabama.gov/search.aspx?sm=d_a',
                formdata=form_data,
                callback=self.parse_next,
                dont_filter=True,
                meta={'page': current_page})
class IlChampaignBuildingPermitsSpider(CommonSpider):
    """Scrape building permits from the City of Champaign eTRAKiT portal.

    ``parse`` iterates a day-by-day date range (one ISSUED-date search per
    request); ``parse_details`` opens each result row's detail view inline,
    emitting one record per owner/contact, one per contractor, and one per
    inspection (or a single bare record when none of those exist), then
    follows grid pagination and finally loops back to ``parse`` for the
    next date.
    """

    name = '1414_il_champaign_building_permits'
    allowed_domains = ['champaign.il.us']
    start_urls = [
        'http://etrakit.ci.champaign.il.us/etrakit3/Search/permit.aspx'
    ]
    handle_httpstatus_list = [500]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1414_Permits_Building_IL_Champaign_CurationReady'),
        'JIRA_ID': 'AI_1414',
        'DOWNLOAD_DELAY': 0.5,
        # NOTE(review): misspelled — Scrapy's setting is
        # 'CONCURRENT_REQUESTS'; confirm whether a project wrapper reads
        # this key before renaming it.
        'CONCURENT_REQUEST': 1,
        'TRACKING_OPTIONAL_PARAMS': ['permit_lic_no'],
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('il_champaign_building_permits'),
        'TOP_HEADER': {
            'apn': 'APN',
            'approved date': 'Approved Date',
            'contractor_address_string': 'Address.2',
            'contractor_dba': '',
            'dba_name': '',
            'finaled date': 'Finaled Date',
            'inspection_date': 'Completed',
            'inspection_pass_fail': 'Result',
            'inspection_subtype': 'Type.1',
            'inspection_type': '',
            'location_address_string': 'Address',
            'mixed_contractor_name': 'CONTRACTOR',
            'mixed_name': 'Name',
            'mixed_subtype': '',
            'notes': 'Notes',
            'permit_applied_date': 'Applied Date',
            'permit_lic_desc': 'Short Description',
            'permit_lic_eff_date': 'Issued Date',
            'permit_lic_exp_date': 'Expiration Date',
            'permit_lic_fee': 'Fees',
            'permit_lic_no': 'Permit #',
            'permit_lic_status': 'Status',
            'permit_subtype': 'Type',
            'permit_type': '',
            'person_address_string': 'Address.1',
            'property type': 'Property Type',
            'scheduled date': 'Scheduled Date',
            'subtype': 'Subtype'
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'permit_subtype', 'subtype', 'permit_lic_desc',
            'permit_lic_status', 'permit_applied_date', 'approved date',
            'permit_lic_eff_date', 'finaled date', 'permit_lic_exp_date',
            'notes', 'location_address_string', 'property type', 'apn',
            'mixed_name', 'dba_name', 'mixed_subtype',
            'person_address_string', 'mixed_contractor_name',
            'contractor_dba', 'contractor_address_string', 'permit_lic_fee',
            'inspection_subtype', 'inspection_pass_fail', 'scheduled date',
            'inspection_date', 'inspection_type', 'permit_type', 'sourceName',
            'url', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': [
            'apn', 'scheduled date', 'subtype', 'notes', 'finaled date',
            'approved date', 'property type'
        ]
    }
    # Queue of per-day search dates, built once on the first parse() call.
    search_element = []
    check_first = True
    end_date = ''

    def parse(self, response):
        """Pop the next date from the queue and POST an ISSUED-date search."""
        if self.check_first:
            self.check_first = False
            self.search_element = SearchCriteria.dateRange(
                self.start, self.end, freq='1D', formatter='%m/%d/%Y')
            self.end_date = self.search_element.pop(0)
        if len(self.search_element) > 0:
            start_date = copy.copy(self.end_date)
            self.end_date = self.search_element.pop(0)
            formdata = {
                'ctl00$RadScriptManager1':
                'ctl00$RadScriptManager1|ctl00$cplMain$btnSearch',
                '__EVENTTARGET': ' ctl00$cplMain$btnSearch',
                '__EVENTARGUMENT': '',
                '__VIEWSTATE': response.xpath(
                    '//*[@id="__VIEWSTATE"]/@value').extract_first(),
                '__VIEWSTATEGENERATOR': '2A136539',
                'ctl00$ucLogin$hfDashboardRedirect':
                'https://etrakit.champaignil.gov/etrakit/dashboard.aspx',
                'ctl00$ucLogin$hfCartRedirect':
                'https://etrakit.champaignil.gov/etrakit/ShoppingCart.aspx?iscartview=true',
                'ctl00$ucLogin$hfHome':
                ' https://etrakit.champaignil.gov/etrakit/default.aspx',
                'ctl00_ucLogin_RadTextBox2_ClientState':
                '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}',
                'ctl00_ucLogin_txtPassword_ClientState':
                '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}',
                'ctl00$cplMain$ddSearchBy': 'Permit_Main.ISSUED',
                'ctl00$cplMain$ddSearchOper': 'EQUALS',
                'ctl00$cplMain$txtSearchString': str(start_date),
                'ctl00_cplMain_tcSearchDetails_ClientState':
                '{"selectedIndexes":["2"],"logEntries":[],"scrollState":{}}',
                '__ASYNCPOST': 'true',
                'RadAJAXControlID': 'ctl00_RadAjaxManager1'
            }
            yield scrapy.FormRequest(url=response.url,
                                     formdata=formdata,
                                     method='POST',
                                     dont_filter=True,
                                     callback=self.parse_details,
                                     meta={
                                         'page': 1,
                                         'start_date': str(start_date)
                                     })

    @inline_requests
    def parse_details(self, response):
        """Open each result row's detail view (RowClick postback, fetched
        inline) and emit owner / contractor / inspection records; then page
        through the grid and finally return to parse() for the next date."""
        page_no = response.meta['page']
        start_date = response.meta['start_date']
        tr_list = response.xpath(
            '//*[@id="ctl00_cplMain_rgSearchRslts_ctl00"]//tr[@class="rgRow" or @class="rgAltRow"]'
        )
        for ind, tr in enumerate(tr_list):
            value = 'RowClick;' + str(ind)
            permit_lic_no = tr.xpath('td[1]/text()').extract_first()
            form_data_link = {
                'ctl00$RadScriptManager1':
                'ctl00$ctl00$cplMain$rgSearchRsltsPanel|ctl00$cplMain$rgSearchRslts',
                'ctl00$cplMain$ddSearchBy': 'Permit_Main.ISSUED',
                'ctl00$cplMain$ddSearchOper': 'EQUALS',
                'ctl00$cplMain$txtSearchString': str(start_date),
                'ctl00$ucLogin$hfDashboardRedirect':
                'https://etrakit.champaignil.gov/etrakit/dashboard.aspx',
                'ctl00$ucLogin$hfCartRedirect':
                'https://etrakit.champaignil.gov/etrakit/ShoppingCart.aspx?iscartview=true',
                'ctl00$ucLogin$hfViewEditProfile': 'static value',
                'ctl00$ucLogin$hfHome':
                'https://etrakit.champaignil.gov/etrakit/default.aspx',
                '__EVENTTARGET': 'ctl00$cplMain$rgSearchRslts',
                '__EVENTARGUMENT': str(value),
                # The async postback response embeds the new view state in a
                # pipe-delimited payload rather than a hidden input.
                '__VIEWSTATE':
                response.text.split('__VIEWSTATE|')[1].split('|')[0],
                '__VIEWSTATEGENERATOR': '2A136539',
                '__ASYNCPOST': 'true',
                'RadAJAXControlID': 'ctl00_RadAjaxManager1'
            }
            parse_response = yield scrapy.FormRequest(
                url=
                'https://etrakit.champaignil.gov/etrakit/Search/permit.aspx',
                method='POST',
                formdata=form_data_link,
                dont_filter=True,
                meta={
                    'permit_lic_no': permit_lic_no,
                    'optional': {
                        'permit_lic_no': permit_lic_no
                    }
                })
            permit_lic_no = parse_response.meta['permit_lic_no']
            permit_subtype = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitType"]/text()').extract_first(
                )
            subtype = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitSubtype"]/text()'
            ).extract_first()
            permit_lic_desc = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitDesc"]/text()').extract_first(
                )
            permit_lic_status = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitStatus"]/text()'
            ).extract_first()
            permit_applied_date = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitAppliedDate"]/text()'
            ).extract_first()
            approved_date = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitApprovedDate"]/text()'
            ).extract_first()
            permit_lic_eff_date = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitIssuedDate"]/text()'
            ).extract_first()
            finaled_date = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitFinaledDate"]/text()'
            ).extract_first()
            permit_lic_exp_date = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitExpirationDate"]/text()'
            ).extract_first()
            notes1 = parse_response.xpath(
                '//*[@id="cplMain_ctl08_lblPermitNotes"]/text()').extract()
            notes = self.val_strip(','.join(notes1))
            loc_addr_1 = parse_response.xpath(
                'normalize-space(//*[@id="cplMain_ctl09_hlSiteAddress"]/text())'
            ).extract_first()
            loc_addr_2 = parse_response.xpath(
                'normalize-space(//*[@id="cplMain_ctl09_lblSiteCityStateZip"]/text())'
            ).extract_first()
            # Join street + city/state/zip, defaulting the state to IL.
            if loc_addr_2 and len(loc_addr_2) > 2:
                location_address_string = loc_addr_1 + ', ' + loc_addr_2
            elif loc_addr_1 and len(loc_addr_1) > 2:
                location_address_string = loc_addr_1 + ', IL'
            else:
                location_address_string = 'IL'
            if 'IL,' in location_address_string:
                location_address_string = location_address_string.replace(
                    'IL,', 'IL')
            property_type = parse_response.xpath(
                '//*[@id="cplMain_ctl09_lblPropertyType"]/text()'
            ).extract_first()
            apn = parse_response.xpath(
                '//*[@id="cplMain_RadPageViewSiteInfo"]//tr[4]//a/text()'
            ).extract_first()
            permit_lic_fee = parse_response.xpath(
                '//*[@id="cplMain_ctl11_lblTotalFees"]/text()[2]'
            ).extract_first()
            # Collect the inspection grid (skipping its header row).
            insp_list = []
            insp = parse_response.xpath(
                '//*[@id="ctl00_cplMain_ctl12_rgInspectionInfo_ctl00"]//tr'
            )[1:]
            for ins in insp:
                inspection_subtype = ins.xpath('td[1]/text()').extract_first()
                inspection_pass_fail = ins.xpath(
                    'td[2]/text()').extract_first()
                scheduled_date = ins.xpath('td[3]/span/text()').extract_first()
                inspection_date = ins.xpath('td[5]/text()').extract_first()
                inspection_type = 'building_inspection'
                insp_dict = {
                    'inspection_subtype': inspection_subtype,
                    'inspection_pass_fail': inspection_pass_fail,
                    'scheduled_date': scheduled_date,
                    'inspection_date': inspection_date,
                    'inspection_type': inspection_type
                }
                insp_list.append(insp_dict)
            # Baseline record; copied and specialised per contact/inspection.
            data_pass = {
                'permit_lic_no': permit_lic_no,
                'mixed_name': '',
                'mixed_contractor_name': '',
                'permit_subtype': permit_subtype,
                'subtype': subtype,
                'permit_lic_desc': permit_lic_desc,
                'permit_lic_status': permit_lic_status,
                'permit_applied_date': permit_applied_date,
                'approved_date': approved_date,
                'permit_lic_eff_date': permit_lic_eff_date,
                'finaled_date': finaled_date,
                'permit_lic_exp_date': permit_lic_exp_date,
                'notes': notes,
                'location_address_string':
                self.val_strip(location_address_string),
                'property_type': property_type,
                'apn': apn,
                'person_address_string': '',
                'mixed_subtype': '',
                'inspection_subtype': '',
                'inspection_pass_fail': '',
                'scheduled_date': '',
                'inspection_date': '',
                'inspection_type': '',
                'permit_type': '',
                'stories': '',
                'year_built': '',
                'dba_name': '',
                'permit_lic_fee': permit_lic_fee,
                'contractor_address_string': '',
                'contractor_dba': ''
            }
            check = 0
            mixed_table = parse_response.xpath(
                '//*[@id="ctl00_cplMain_ctl10_rgContactInfo_ctl00"]//tr[@class="rgRow" or @class="rgAltRow"]'
            )
            name_list = []
            for mixed in mixed_table:
                mixed_subtype = mixed.xpath('td[1]/text()').extract_first()
                # dup_key imposes a stable sort order on contact roles so the
                # groupby below keeps the highest-priority row per name.
                if mixed_subtype and 'OWNER' in mixed_subtype:
                    dup_key = 'A'
                elif mixed_subtype and 'CONTRACTOR' in mixed_subtype:
                    dup_key = 'B'
                elif mixed_subtype and 'HVAC' in mixed_subtype:
                    dup_key = 'C'
                elif mixed_subtype and 'PLUMBING' in mixed_subtype:
                    dup_key = 'D'
                elif mixed_subtype and 'ELECTRICAL' in mixed_subtype:
                    dup_key = 'E'
                elif mixed_subtype and 'SPRINKLER' in mixed_subtype:
                    dup_key = 'F'
                elif mixed_subtype and 'TENANT' in mixed_subtype:
                    dup_key = 'G'
                elif mixed_subtype and 'ARCHITECT' in mixed_subtype:
                    dup_key = 'H'
                elif mixed_subtype and 'APPLICANT' in mixed_subtype:
                    dup_key = 'I'
                else:
                    dup_key = 'J'
                name = mixed.xpath('td[2]/text()').extract_first()
                mixed_name = self._getDBA(name)[0]
                dba_name = self._getDBA(name)[1]
                person_add1 = mixed.xpath('td[5]/text()').extract_first()
                person_add2 = mixed.xpath('td[6]/text()').extract_first()
                if person_add2 and len(person_add2) > 2:
                    person_address_string = person_add1 + ', ' + person_add2
                elif person_add1 and len(person_add1) > 2:
                    person_address_string = person_add1 + ', IL'
                else:
                    person_address_string = 'IL'
                mixed_dict = {
                    'mixed_name': mixed_name,
                    'dba_name': dba_name,
                    'mixed_subtype': mixed_subtype,
                    'person_address_string': person_address_string,
                    'dup_key': dup_key
                }
                name_list.append(mixed_dict)
            # Collapse duplicate contacts: group rows by name and keep the
            # first non-empty value per column (priority set by dup_key sort).
            name_list_val = []
            if len(name_list) > 0:
                df = pd.DataFrame([i for i in name_list
                                   if i]).drop_duplicates().sort_values(
                                       'dup_key', ascending=True).fillna('')

                def joinFunc(g, column):
                    # First non-null, non-empty value of this column.
                    val = [
                        str(each) for each in g[column]
                        if pd.notnull(each) and str(each)
                    ]
                    if val:
                        return val[0]
                    return ''

                groups = df.groupby('mixed_name', as_index=False)
                groupFunct = lambda g: pd.Series(
                    [joinFunc(g, col) for col in g.columns], index=g.columns)
                df = groups.apply(groupFunct).fillna('').reset_index(drop=True)
                name_list_val = df.to_dict('records')
            if len(name_list_val) > 0:
                for name_val in name_list_val:
                    if name_val[
                            'mixed_name'] and 'CONTRACTOR' not in name_val[
                                'mixed_subtype']:
                        own_dict = data_pass.copy()
                        own_dict.update(name_val)
                        check = 1
                        yield self.save_to_csv(response, **own_dict)
                    if 'CONTRACTOR' in name_val['mixed_subtype']:
                        cont_dic = data_pass.copy()
                        cont_dic['mixed_contractor_name'] = name_val[
                            'mixed_name']
                        cont_dic['contractor_address_string'] = name_val[
                            'person_address_string']
                        cont_dic['contractor_dba'] = name_val['dba_name']
                        cont_dic['mixed_subtype'] = ''
                        check = 3
                        yield self.save_to_csv(parse_response, **cont_dic)
            if insp_list and len(insp_list) > 0:
                for inspect in insp_list:
                    ins_dict = data_pass.copy()
                    ins_dict.update(inspect)
                    check = 4
                    yield self.save_to_csv(parse_response, **ins_dict)
            # Nothing was emitted for this permit: yield the bare record.
            if check == 0:
                yield self.save_to_csv(parse_response, **data_pass)
        # Grid footer shows 'X of Y'; keep paging until exhausted, then
        # return to parse() for the next date in the range.
        page = response.xpath(
            '//*[@id="ctl00_cplMain_rgSearchRslts_ctl00"]/tfoot//tr[5]/td/span/text()'
        ).extract_first()
        if page:
            next_page = page.split(' of ')[1]
            head = {
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8',
                'origin': 'https://etrakit.champaignil.gov',
                'referer':
                'https://etrakit.champaignil.gov/etrakit/Search/permit.aspx',
                'user-agent':
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
            }
            if int(page_no) < int(next_page) + 1:
                page_data = {
                    'ctl00$RadScriptManager1':
                    'ctl00$ctl00$cplMain$rgSearchRsltsPanel|ctl00$cplMain$rgSearchRslts',
                    'RadScriptManager1_TSM':
                    ';;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:b7585254-495e-4311-9545-1f910247aca5:ea597d4b:b25378d2;Telerik.Web.UI, Version=2013.2.717.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:0507d587-20ad-4e22-b866-76bd3eaee2df:16e4e7cd:ed16cbdc:f7645509:24ee1bba:92fe8ea0:f46195d3:fa31b949:874f8ea2:19620875:490a9d4e:bd8f85e4:b7778d6c:58366029:e330518b:1e771326:8e6f0d33:6a6d718d;',
                    'ctl00_ucLogin_rwmLogin_ClientState': '',
                    'ctl00$ucLogin$hfDashboardRedirect':
                    'https://etrakit.champaignil.gov/etrakit/dashboard.aspx',
                    'ctl00$ucLogin$hfCartRedirect':
                    'https://etrakit.champaignil.gov/etrakit/ShoppingCart.aspx?iscartview=true',
                    'ctl00$ucLogin$hfViewEditProfile': 'static value',
                    'ctl00$ucLogin$hfHome':
                    'https://etrakit.champaignil.gov/etrakit/default.aspx',
                    'ctl00$ucLogin$hfSetupAnAccountForPublic':
                    'https://etrakit.champaignil.gov/etrakit/publicUserAccount.aspx?action=npa',
                    'ctl00$ucLogin$hfSetupAnAccountForContractor':
                    'https://etrakit.champaignil.gov/etrakit/RegistrationConfirmation.aspx',
                    'ctl00$ucLogin$hfContractorCSLBVerification': 'DISABLED',
                    'ctl00$ucLogin$ddlSelLogin': '******',
                    'ctl00$ucLogin$ddlSelContractor': ' A. G',
                    'ctl00$ucLogin$RadTextBox2': 'Password',
                    'ctl00_ucLogin_RadTextBox2_ClientState':
                    '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}',
                    'ctl00$ucLogin$txtPassword': '',
                    'ctl00_ucLogin_txtPassword_ClientState':
                    '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}',
                    'ctl00$hfGoogleKey': '',
                    'ctl00$cplMain$activeTab': '',
                    'ctl00$cplMain$hfActivityMode': '',
                    'ctl00$cplMain$ddSearchBy': 'Permit_Main.ISSUED',
                    'ctl00$cplMain$ddSearchOper': 'EQUALS',
                    'ctl00$cplMain$txtSearchString': str(start_date),
                    'ctl00_cplMain_rgSearchRslts_ClientState': '',
                    'ctl00_cplMain_tcSearchDetails_ClientState':
                    '{"selectedIndexes":["2"],"logEntries":[],"scrollState":{}}',
                    'ctl00_cplMain_RadMultiPageSearch_ClientState': '',
                    'ctl00_cplMain_rw_ClientState': '',
                    '__EVENTTARGET': 'ctl00$cplMain$rgSearchRslts',
                    '__EVENTARGUMENT':
                    'FireCommand:ctl00$cplMain$rgSearchRslts$ctl00;Page;next',
                    '__LASTFOCUS': '',
                    '__VIEWSTATE':
                    response.text.split('__VIEWSTATE|')[1].split('|')[0],
                    '__VIEWSTATEGENERATOR': '2A136539',
                    '__ASYNCPOST': 'true',
                    'RadAJAXControlID': 'ctl00_RadAjaxManager1'
                }
                yield scrapy.FormRequest(
                    url=
                    'https://etrakit.champaignil.gov/etrakit/Search/permit.aspx',
                    method='POST',
                    callback=self.parse_details,
                    headers=head,
                    dont_filter=True,
                    meta={
                        'page': int(page_no) + 1,
                        'start_date': start_date
                    },
                    formdata=page_data)
            else:
                yield scrapy.Request(url=self.start_urls[0],
                                     callback=self.parse,
                                     dont_filter=True)
        else:
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse,
                                 dont_filter=True)

    def save_to_csv(self, response, **meta):
        """Load one output record from the assembled ``meta`` dict.

        permit_lic_desc falls back to the permit subtype and then to a
        generic label when the description is missing/too short.
        """
        il = ItemLoader(item=IlChampaignBuildingPermitsSpiderItem(),
                        response=response)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value(
            'url',
            'http://etrakit.ci.champaign.il.us/etrakit3/Search/permit.aspx')
        il.add_value('sourceName', 'IL_Champaign_Building_Permits')
        il.add_value('finaled date', meta['finaled_date'])
        il.add_value('inspection_date', meta['inspection_date'])
        il.add_value('contractor_dba', meta['contractor_dba'])
        il.add_value('mixed_contractor_name', meta['mixed_contractor_name'])
        il.add_value('dba_name', meta['dba_name'])
        il.add_value('apn', meta['apn'])
        il.add_value('permit_lic_fee', meta['permit_lic_fee'])
        il.add_value('location_address_string',
                     meta['location_address_string'])
        il.add_value('person_address_string', meta['person_address_string'])
        il.add_value('subtype', meta['subtype'])
        il.add_value('permit_subtype', meta['permit_subtype'])
        il.add_value('inspection_subtype', meta['inspection_subtype'])
        il.add_value('mixed_subtype', meta['mixed_subtype'])
        il.add_value('contractor_address_string',
                     meta['contractor_address_string'])
        il.add_value('permit_lic_status', meta['permit_lic_status'])
        il.add_value('permit_lic_exp_date', meta['permit_lic_exp_date'])
        il.add_value('permit_lic_no', meta['permit_lic_no'])
        il.add_value('notes', meta['notes'])
        il.add_value('property type', meta['property_type'])
        il.add_value('mixed_name', meta['mixed_name'])
        il.add_value('inspection_pass_fail', meta['inspection_pass_fail'])
        il.add_value('approved date', meta['approved_date'])
        il.add_value('permit_lic_eff_date', meta['permit_lic_eff_date'])
        il.add_value('permit_applied_date', meta['permit_applied_date'])
        il.add_value('scheduled date', meta['scheduled_date'])
        il.add_value(
            'permit_lic_desc', meta['permit_lic_desc']
            if meta['permit_lic_desc'] and len(meta['permit_lic_desc']) > 2
            else meta['permit_subtype'] if meta['permit_subtype']
            and len(meta['permit_subtype']) > 2 else 'Building Permit')
        il.add_value('inspection_type', meta['inspection_type'])
        il.add_value('permit_type', 'building_permit')
        return il.load_item()

    def val_strip(self, value):
        """Drop '&'/'#' characters and collapse whitespace; '' on empty input
        or any failure."""
        if value:
            try:
                str_value = value.replace('&', '').replace('#', '')
                str_value = re.sub(r'\s+', ' ', str_value)
                return str_value.strip()
            except Exception:  # narrowed from bare except; still best-effort
                return ''
        else:
            return ''
class NvClarkBuildingPermitsSpider(CommonSpider):
    """Scrape building permits from the Clark County, NV Accela portal.

    Flow:
      1. ``parse``              - submit the date-range search form.
      2. ``form_postresponse``  - trigger the result grid's CSV-export postback.
      3. ``response_download``  - fetch the exported CSV.
      4. ``getCSVFileReponse``  - for each CSV row, re-search the portal for
         that permit number and enrich the row with detail-page fields
         (description, conditions, job value, contractor/owner info).
    """

    name = '578_nv_clark_building_permits'
    allowed_domains = ['clarkcountynv.gov']
    start_urls = [
        'https://citizenaccess.clarkcountynv.gov/CitizenAccess/Cap/CapHome.aspx?module=Building&TabName=Building'
    ]
    main_url = 'https://citizenaccess.clarkcountynv.gov/CitizenAccess/'
    # Snapshot of the search-form inputs, reused for the per-permit searches.
    modifyformdata = {}
    # URL the search form posts to (captured in parse()).
    form_url = None
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-578_Permits_Buildings_NV_Clark_20050101_20190101'),
        'JIRA_ID': 'AI_578',
        'DOWNLOAD_DELAY': .5,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('NvClarkBuildingPermitsSpider'),
        'TOP_HEADER': {
            'Conditions': '',
            'location_address_string': 'Work Location',
            'mixed_contractor_address_string': 'Contractor Address',
            'mixed_contractor_name': 'Contractor Name',
            'mixed_contractor_phone': 'Contractor Phone',
            'mixed_name': 'Owner Name',
            'mixed_phone': 'Owner Phone',
            'mixed_subtype': '',
            'occupancy_subtype': 'Application Type ',
            'parcel number': 'Parcel Number',
            'permit_lic_desc': 'Project Description',
            'permit_lic_eff_date': 'Date',
            'permit_lic_exp_date': 'Plan Expiration',
            'permit_lic_no': 'Permit Number',
            'permit_lic_status': 'Permit/Complaint Status',
            'permit_lic_value': 'Job Value($)',
            'permit_subtype': 'Permit Type',
            'permit_type': '',
            'person_address_string': 'Owner Address',
            'project name': 'Project Name',
            'short notes': 'Short Notes'
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_eff_date', 'permit_lic_no', 'permit_subtype',
            'Conditions', 'permit_lic_desc', 'occupancy_subtype',
            'permit_lic_value', 'project name', 'permit_lic_status',
            'location_address_string', 'parcel number', 'short notes',
            'permit_lic_exp_date', 'mixed_contractor_name',
            'mixed_contractor_address_string', 'mixed_contractor_phone',
            'mixed_name', 'mixed_subtype', 'person_address_string',
            'mixed_phone', 'permit_type', 'url', 'sourceName',
            'ingestion_timestamp'
        ],
        'NULL_HEADERS': ['project name', 'parcel number', 'short notes']
    }
    # Ensures the one-time YYYYMMDD -> MM/DD/YYYY date reformat happens once.
    check = True

    def parse(self, response):
        """Fill in and submit the ASP.NET general-search form."""
        if self.check:
            # CLI args arrive as YYYYMMDD; the portal expects MM/DD/YYYY.
            self.start = datetime.datetime.strptime(
                self.start, '%Y%m%d').strftime('%m/%d/%Y')
            self.end = datetime.datetime.strptime(
                self.end, '%Y%m%d').strftime('%m/%d/%Y')
            self.check = False
        form = _get_form(response, formname=None, formid='aspnetForm',
                         formnumber=0, formxpath=None)
        formdata = _get_inputs(
            form,
            formdata={
                'ctl00$PlaceHolderMain$generalSearchForm$txtGSStartDate':
                    self.start,
                'ctl00$PlaceHolderMain$generalSearchForm$txtGSEndDate':
                    self.end,
                '__EVENTTARGET': 'ctl00$PlaceHolderMain$btnNewSearch'
            },
            clickdata=None,
            dont_click=False,
            response=response)
        # Keep the full form state so per-permit searches can reuse it later.
        self.modifyformdata = dict(formdata)
        self.form_url = response.url
        yield scrapy.FormRequest(url=response.url,
                                 method='POST',
                                 formdata=formdata,
                                 dont_filter=True,
                                 errback=self.handle_form_error,
                                 callback=self.form_postresponse)

    def form_postresponse(self, response):
        """Replay the grid's export postback to queue the CSV download."""
        form_args = JavaScriptUtils.getValuesFromdoPost(
            response.xpath(
                "//*[@id='ctl00_PlaceHolderMain_dgvPermitList_gdvPermitList_gdvPermitListtop4btnExport']/@href"
            ).extract_first())
        form = _get_form(response, formname=None, formid='aspnetForm',
                         formnumber=0, formxpath=None)
        formdata = _get_inputs(
            form,
            formdata={
                '__EVENTARGUMENT': form_args['__EVENTARGUMENT'],
                '__ASYNCPOST': 'true',
                '__EVENTTARGET': form_args['__EVENTTARGET'],
                'ctl00$ScriptManager1':
                    'ctl00$PlaceHolderMain$dgvPermitList$updatePanel|{}'.format(
                        form_args['__EVENTTARGET'])
            },
            clickdata=None,
            dont_click=False,
            response=response)
        yield scrapy.FormRequest(url=response.url,
                                 method='POST',
                                 formdata=formdata,
                                 dont_filter=True,
                                 errback=self.handle_form_error,
                                 callback=self.response_download)

    def response_download(self, _):
        """Fetch the CSV produced by the export postback."""
        now = datetime.datetime.now()
        # The 'flag' query arg just needs to vary per request (cache buster).
        yield scrapy.Request('{}Export2CSV.ashx?flag={}'.format(
            self.main_url, '%02d%02d' % (now.second, now.minute)),
            callback=self.getCSVFileReponse,
            meta={'page': 1})

    @inline_requests
    def getCSVFileReponse(self, response):
        """Enrich each CSV row with its detail page and emit one item per permit."""
        rawData = pd.read_csv(io.StringIO(response.text)).fillna('')
        for _row2 in rawData.to_dict('records'):
            # FIX: removed leftover debug override that pinned every row to
            # Record Number '001166-19PA'.
            getLandingPage_res = yield scrapy.Request(
                self.start_urls[0],
                meta={
                    'optional': {'Permit_Number': _row2['Permit Number']},
                    # Fresh cookie jar per permit keeps searches independent.
                    'cookiejar': _row2['Permit Number'],
                    'data': _row2
                },
                dont_filter=True)
            _row1 = getLandingPage_res.meta['data']
            update_formdata = {
                'ctl00$PlaceHolderMain$generalSearchForm$txtGSStartDate':
                    self.start,
                'ctl00$PlaceHolderMain$generalSearchForm$txtGSEndDate':
                    self.end,
                '__EVENTTARGET': 'ctl00$PlaceHolderMain$btnNewSearch',
                'ctl00$PlaceHolderMain$generalSearchForm$txtGSPermitNumber':
                    _row1['Permit Number']
            }
            getPermitLic_View_res = yield scrapy.FormRequest(
                url=self.form_url,
                method='POST',
                formdata={**self.modifyformdata, **update_formdata},
                meta={
                    'optional': {'Permit_Number': _row1['Permit Number']},
                    'data': _row1
                },
                dont_filter=True)
            _row = getPermitLic_View_res.meta['data']
            occupancy_subtype = getPermitLic_View_res.xpath(
                ' //*[contains(text(), "Application Type - ")]/ancestor::*/following-sibling::*/span/text()'
            ).extract_first()
            condition = ''.join(getPermitLic_View_res.xpath(
                '//span[@id="ctl00_PlaceHolderMain_capConditions_lblNotice"]//text()'
            ).extract())
            permit_lic_value = getPermitLic_View_res.xpath(
                '//h2[contains(text(),"Job Value($):")]/ancestor::span/following-sibling::span/text()'
            ).extract_first()
            if permit_lic_value:
                permit_lic_value = permit_lic_value.replace('$', '')
            parcel_number = getPermitLic_View_res.xpath(
                '//h2[contains(text(),"Parcel Number:")]/following-sibling::div/text()'
            ).extract_first()
            address_string = getPermitLic_View_res.xpath(
                '//*[@id="tbl_worklocation"]//tr/td[2]//text()').extract()
            address_string1 = ','.join(address_string).rstrip(',')
            # If the address doesn't already carry a recognized state, inject
            # 'NV' ahead of the trailing zip digits.
            location_address_string = (
                address_string1 if self.state_list(address_string1)
                else re.sub(r'(\d+)$', r'NV \1', address_string1))
            location_address_string = location_address_string.replace(',', '')
            location_address_string = (
                location_address_string + ', NV'
                if location_address_string else 'NV')
            contractor_name = getPermitLic_View_res.xpath(
                '//table[@id="tbl_licensedps"]//tr/td[2]/text()'
            ).extract_first()
            contractor_address = ''
            contractor_phone = ''
            if contractor_name:
                contractor_add = getPermitLic_View_res.xpath(
                    '//table[@id="tbl_licensedps"]//tr//td//text()'
                ).extract()[2:6]
                contractor_address1 = [
                    x for x in contractor_add
                    if 'Phone' not in x and 'Contractor' not in x
                    and 'Neveda' not in x and 'Owner' not in x
                    and 'View' not in x
                ]
                contractor_address2 = ''.join(contractor_address1).replace(
                    ',United States', '').replace('ATTN: ', '').replace(
                        '(702) 275-3512', '')
                # split() returns (head, tail); joining them back drops only
                # the last comma from the address string.
                contractor_address = ''.join(
                    self.split(contractor_address2, ',', -1))
                contractor_phone = getPermitLic_View_res.xpath(
                    '//td[contains(text(),"Home Phone:")]/following::div[@class="ACA_PhoneNumberLTR"]/text()'
                ).extract_first()
            owner_name = getPermitLic_View_res.xpath(
                '///span[contains(text(),"Owner:")]//following::tr[2]/td/text()'
            ).extract_first()
            owner_address = ''
            owner_phone = ''
            mixed_subtype = ''
            if owner_name:
                # FIX: collapsed a redundant nested `if owner_name:` check.
                mixed_subtype = 'Owner'
                owner_add = getPermitLic_View_res.xpath(
                    '//span[contains(text(),"Owner:")]/ancestor::td//following-sibling::tr/td/text()'
                ).extract()[:-1]
                owner_address = ','.join(owner_add)
            il = ItemLoader(item=NvClarkBuildingPermitsSpiderItem(),
                            response=getPermitLic_View_res)
            il.add_value('ingestion_timestamp',
                         Utils.getingestion_timestamp())
            il.add_value('permit_type', 'Building_Permit')
            # FIX: sourceName/url previously said WA_King_Seattle (copy-paste
            # from another spider); this is the Clark County, NV source.
            il.add_value('sourceName', 'NV_Clark_Building_Permits')
            il.add_value(
                'url',
                'https://citizenaccess.clarkcountynv.gov/CitizenAccess/Cap/CapHome.aspx?module=Building&TabName=Building'
            )
            il.add_value('permit_lic_no', _row.get('Permit Number', ''))
            il.add_value('permit_subtype', _row.get('Permit Type', ''))
            il.add_value('permit_lic_status', _row.get('Status', ''))
            il.add_value('project name', _row.get('Project Name', ''))
            il.add_value('permit_lic_eff_date', _row.get('Date', ''))
            il.add_value(
                'permit_lic_desc',
                _row.get('Description') if _row.get('Description')
                else _row.get('Permit Type'))
            il.add_value('short notes', _row.get('Short Notes', ''))
            il.add_value('occupancy_subtype', occupancy_subtype)
            il.add_value('location_address_string', location_address_string)
            il.add_value('Conditions', condition)
            il.add_value('permit_lic_value', permit_lic_value)
            il.add_value('parcel number', parcel_number)
            il.add_value('mixed_contractor_name', contractor_name)
            il.add_value('mixed_contractor_address_string',
                         contractor_address)
            il.add_value('mixed_contractor_phone', contractor_phone)
            il.add_value('mixed_name', owner_name)
            il.add_value('mixed_subtype', mixed_subtype)
            il.add_value('person_address_string', owner_address)
            il.add_value('mixed_phone', owner_phone)
            yield il.load_item()

    def split(self, strng, sep, pos):
        """Split ``strng`` on ``sep`` at index ``pos``; return (head, tail)."""
        strng = strng.split(sep)
        return sep.join(strng[:pos]), sep.join(strng[pos:])
class KsVeterinaryboardLicensesSpider(CommonSpider):
    """Scrape Kansas Board of Veterinary Examiners licensee PDFs.

    Four PDFs are parsed with tabula; each yields one item per table row.
    The first PDF (RVTs) carries a registration year ('YR Registd') in
    column 3; the other three carry an issue date ('DATE ISSUED') in
    column 2.
    """

    name = '1676_ks_veterinaryboard_licenses'
    allowed_domains = ['ks.gov']
    start_urls = [
        'http://agriculture.ks.gov/divisions-programs/division-of-animal-health/kansas-board-of-veterinary-examiners/licensee-information'
    ]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI_1676_Licenses_VeterinaryBoard_KS_CurationReady'),
        'JIRA_ID': 'AI_1676',
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('ks_veterinaryboard_licenses'),
        'TOP_HEADER': {
            'company_name': 'VETERINARIAN/Contact',
            'dba_name': '',
            'location_address_string': '',
            'permit_applied_date': 'YR Registd',
            'permit_lic_desc': '',
            'permit_lic_eff_date': 'DATE ISSUED',
            'permit_type': ''
        },
        'FIELDS_TO_EXPORT': [
            'company_name',
            'dba_name',
            'location_address_string',
            'permit_lic_eff_date',
            'permit_applied_date',
            'permit_lic_desc',
            'permit_type',
            'url',
            'sourceName',
            'ingestion_timestamp',
        ],
        'NULL_HEADERS': []
    }

    # (pdf url, tabula crop area, name column, date column, meta date field)
    _PDF_SOURCES = [
        ('https://kbve.kansas.gov/wp-content/uploads/2017/09/Current_RVTs.pdf',
         [92.948, 49.725, 731.723, 530.145], 2, 3, 'permit_applied_date'),
        ('https://kbve.kansas.gov/wp-content/uploads/2017/09/currently_not_licensed.pdf',
         [79.178, 47.43, 731.723, 474.3], 1, 2, 'permit_lic_eff_date'),
        ('https://kbve.kansas.gov/wp-content/uploads/2017/09/newly_licensed.pdf',
         [132.728, 52.02, 727.898, 445.23], 1, 2, 'permit_lic_eff_date'),
        ('https://kbve.kansas.gov/wp-content/uploads/2017/09/currently_licensed.pdf',
         [92.948, 19.125, 750.083, 388.62], 1, 2, 'permit_lic_eff_date'),
    ]

    def parse(self, response):
        """Read each licensee PDF and yield one item per table row."""
        for url, area, name_col, date_col, date_field in self._PDF_SOURCES:
            df = tabula.read_pdf(url,
                                 pages='all',
                                 area=area,
                                 silent=True,
                                 guess=False,
                                 encoding='utf-8',
                                 pandas_option={'header': None})
            for _, row in df.fillna('').iterrows():
                row = row.tolist()
                meta = {
                    'company_name': row[name_col],
                    'location_address_string': 'KS',
                    'permit_applied_date': '',
                    'permit_lic_eff_date': '',
                }
                meta[date_field] = row[date_col]
                meta['permit_lic_desc'] = ('Veterinary License for '
                                           + meta['company_name'])
                yield self.save_to_csv(response, **meta).load_item()

    def save_to_csv(self, response, **meta):
        """Populate an item loader from ``meta``; caller calls load_item()."""
        il = ItemLoader(item=KsVeterinaryboardLicensesSpiderItem(),
                        response=response)
        # FIX: ingestion_timestamp is in FIELDS_TO_EXPORT but this line was
        # commented out, leaving the exported column empty.
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'KS_VeterinaryBoard_Licenses')
        il.add_value(
            'url',
            'http://agriculture.ks.gov/divisions-programs/division-of-animal-health/kansas-board-of-veterinary-examiners/licensee-information'
        )
        # Split once: _getDBA returns (primary name, dba name).
        dba = self._getDBA(meta['company_name'])
        il.add_value('company_name', dba[0])
        il.add_value('permit_lic_desc', meta['permit_lic_desc'])
        il.add_value('permit_applied_date', meta['permit_applied_date'])
        il.add_value('location_address_string',
                     meta['location_address_string'])
        il.add_value('permit_lic_eff_date', meta['permit_lic_eff_date'])
        il.add_value('permit_type', 'veterinary_license')
        il.add_value('dba_name', dba[1])
        return il
class OhSosSpider(CommonSpider):
    """Scrape Ohio Secretary of State business registrations.

    ``parse`` builds a queue of search keys (alphabetic or numeric range),
    ``parse_data`` runs each search and collects charter numbers into
    ``self.ids``, and ``main_data`` fetches each entity's detail JSON and
    emits one item per incorporator/agent (or one bare item when none).
    """

    name = '187_oh_sos'
    allowed_domains = ['state.oh.us']
    start_urls = ['https://businesssearch.sos.state.oh.us/#busDialog']
    custom_settings = {
        'FILE_NAME':
            Utils.getRundateFileName('AI-187_Companies_SOS_OH_CurationReady'),
        'JIRA_ID': 'AI_187',
        'DOWNLOAD_DELAY': .2,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('oh_sos'),
        'TOP_HEADER': {
            'company_name': 'Business Name',
            'company_subtype': 'Filing Type',
            'creation_date': '',
            'dba_name': '',
            'entity_id': 'Entity',
            'inactive_date': '',
            'location_address_string': 'Location+County+State',
            'mixed_name': 'Agent/Incorporator Information',
            'mixed_subtype': 'Agent/Incorporator',
            'permit_lic_desc': '',
            'permit_lic_exp_date': 'Exp Date',
            'permit_type': '',
            'person_address_string': 'Agent/Incorporator Address',
            'status': 'Status'
        },
        'FIELDS_TO_EXPORT': [
            'entity_id',
            'company_name',
            'dba_name',
            'company_subtype',
            'creation_date',
            'permit_lic_exp_date',
            'status',
            'location_address_string',
            'mixed_name',
            'mixed_subtype',
            'person_address_string',
            'permit_lic_desc',
            # 'inactive_date',
            'permit_type',
            'url',
            'sourceName',
            'ingestion_timestamp',
        ],
        'NULL_HEADERS': ['status']
    }

    # Search-API URL templates; {} slots are (search key / charter id, cache
    # buster from _rnd()).
    _SEARCH_URL = ('https://businesssearchapi.sos.state.oh.us/'
                   'zyjLcCmoqeZffOn1ajJdsiek3tmuj9QtZVn{}_X?_={}')
    _DETAIL_URL = ('https://businesssearchapi.sos.state.oh.us/'
                   'Rtj0lqmmno6vaBwbRxU7TvnJY6RmAt0bipK{}?_={}')

    @staticmethod
    def _rnd():
        """Millisecond timestamp string used as a cache-busting query arg."""
        return str(int(round(time.time() * 1000)))

    def parse(self, response):
        """Build the search-key queue and fire the first search request."""
        self.searchkeys = []
        if self.start.isalpha() and self.end.isalpha():
            self.searchkeys = SearchCriteria.strRange(self.start, self.end)
            # These tokens are API keywords, not valid search terms.
            for reserved in ('AND', 'NOT', 'BTI'):
                if reserved in self.searchkeys:
                    self.searchkeys.remove(reserved)
        else:
            self.searchkeys = SearchCriteria.numberRange(
                self.start, self.end, 1)
        print(len(self.searchkeys))
        # FIX: initialize the charter-id queue BEFORE yielding; previously it
        # was assigned after the yield, i.e. only when the generator resumed.
        self.ids = []
        search = self.searchkeys.pop(0)
        url = self._SEARCH_URL.format(str(search), self._rnd())
        yield scrapy.Request(url=url,
                             callback=self.parse_data,
                             dont_filter=True,
                             meta={'search': search})

    @inline_requests
    def parse_data(self, response):
        """Collect charter numbers from one search result, then advance."""
        payload = json.loads(response.body_as_unicode())
        # The API returns a bare int for some error responses.
        if not isinstance(payload, int):
            json_res = payload['data']
            if payload['data']:
                # Append a progress line (search key | result count) next to
                # this module for ad-hoc auditing of coverage.
                module_dir = os.path.dirname(os.path.realpath(__file__))
                path = module_dir + '/readme.txt'
                with open(path, 'a') as f:
                    if json_res:
                        f.write(response.meta['search'] + "|" +
                                str(json_res[0]['result_count']) + "\n")
                    else:
                        f.write(response.meta['search'] + "|" +
                                str('no record') + "\n")
            for data in json_res:
                if data.get('charter_num', ''):
                    self.ids.append(data.get('charter_num', ''))
            if self.ids:
                id = self.ids.pop(0)
                # FIX: was `.format(id, rnd)` — the lambda object itself was
                # interpolated into the URL instead of its value.
                mix_url = self._DETAIL_URL.format(id, self._rnd())
                yield scrapy.Request(url=mix_url,
                                     dont_filter=True,
                                     callback=self.main_data,
                                     meta={'search': response.meta['search']})
            else:
                if self.searchkeys:
                    search = self.searchkeys.pop(0)
                    url = self._SEARCH_URL.format(str(search), self._rnd())
                    yield scrapy.Request(url=url,
                                         callback=self.parse_data,
                                         dont_filter=True,
                                         meta={'search': search})
        else:
            if self.searchkeys:
                search = self.searchkeys.pop(0)
                url = self._SEARCH_URL.format(str(search), self._rnd())
                yield scrapy.Request(url=url,
                                     callback=self.parse_data,
                                     dont_filter=True,
                                     meta={'search': search})

    def main_data(self, response):
        """Emit items for one entity, then request the next id or search."""
        all_data = json.loads(response.body_as_unicode())['data']
        data = all_data[4].get("firstpanel")[0]
        location_address_string = ''
        if data.get('business_location_name', '') and len(
                data.get('business_location_name', '')) > 2:
            location_address_string += data.get('business_location_name',
                                                '') + ", "
        if data.get('county_name', '') and len(data.get('county_name',
                                                        '')) > 2:
            location_address_string += data.get('county_name', '') + ", "
        if data.get('state_name', '') and len(data.get('state_name',
                                                       '')) > 2:
            location_address_string += data.get('state_name', '')
        else:
            location_address_string += 'OH'
        data_dic = {}
        data_dic['entity_id'] = data.get('charter_num', '')
        com_dba = self._getDBA(data.get('business_name', ''))
        data_dic['company_name'] = com_dba[0] if com_dba[0] else com_dba[1]
        data_dic['dba_name'] = com_dba[1]
        data_dic['company_subtype'] = data.get(
            'business_type') if data.get('business_type') else ''
        data_dic['creation_date'] = self.format_date(data.get('effect_date'))
        data_dic['permit_lic_exp_date'] = self.format_date(
            data.get('expiry_date'))
        data_dic['status'] = data.get('status', '')
        data_dic['location_address_string'] = (
            location_address_string if location_address_string else 'OH')
        data_dic['permit_lic_desc'] = (
            'Business License for ' + data_dic['company_name']
            if data_dic['company_name'] else 'Business License')
        mix_json = all_data
        c = 0
        for incorp in mix_json[3]['details']:
            c += 1
            mix_dba = self._getDBA(incorp.get('business_assoc_name'))
            if mix_dba[1] and data_dic['dba_name']:
                # Flush the company-level DBA row before overwriting it.
                # FIX: was yielding the ItemLoader itself; load_item() added.
                yield self.save_csv(response, data_dic).load_item()
                data_dic['dba_name'] = ''
            else:
                # FIX: removed duplicated `data_dic['dba_name'] =` assignment.
                data_dic['dba_name'] = (mix_dba[1] if mix_dba[1]
                                        else data_dic['dba_name'])
            il = self.save_csv(response, data_dic)
            il.add_value('mixed_name', mix_dba[0])
            il.add_value('mixed_subtype', 'Incorporator ')
            il.add_value('person_address_string', 'OH')
            yield il.load_item()
        for registrant in mix_json[1].get('registrant'):
            c += 1
            person_address = self.format__address_4(
                registrant.get('contact_addr1') + str(
                    ', ' + registrant.get('contact_addr2')
                    if len(registrant.get('contact_addr2').strip('-')) > 2
                    else ''),
                registrant.get('contact_city'),
                registrant.get('contact_state'),
                registrant.get('contact_zip9'))
            mix_dba = self._getDBA(registrant.get('contact_name'))
            if mix_dba[1] and data_dic['dba_name']:
                # FIX: was yielding the ItemLoader itself; load_item() added.
                yield self.save_csv(response, data_dic).load_item()
                data_dic['dba_name'] = ''
            else:
                data_dic['dba_name'] = (mix_dba[1] if mix_dba[1]
                                        else data_dic['dba_name'])
            il = self.save_csv(response, data_dic)
            il.add_value('mixed_name', mix_dba[0])
            il.add_value('mixed_subtype', 'Agent ')
            il.add_value('person_address_string', person_address)
            yield il.load_item()
        if c == 0:
            # No incorporators or agents: emit the bare company record.
            yield self.save_csv(response, data_dic).load_item()
        if self.ids:
            id = self.ids.pop(0)
            # FIX: was `.format(id, rnd)` without calling the function.
            mix_url = self._DETAIL_URL.format(id, self._rnd())
            yield scrapy.Request(url=mix_url,
                                 dont_filter=True,
                                 callback=self.main_data,
                                 meta={'search': response.meta['search']})
        else:
            if self.searchkeys:
                search = self.searchkeys.pop(0)
                url = self._SEARCH_URL.format(str(search), self._rnd())
                yield scrapy.Request(url=url,
                                     callback=self.parse_data,
                                     dont_filter=True,
                                     meta={'search': search})

    def save_csv(self, response, data_dic):
        """Populate a loader with constants plus ``data_dic`` fields."""
        il = ItemLoader(item=OhSosSpiderItem(), response=response)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'OH_SOS')
        il.add_value(
            'url',
            'https://www5.sos.state.oh.us/ords/f?p=100:1:::NO:1:P1_TYPE:NAME')
        il.add_value('permit_type', 'business_license')
        for k in data_dic:
            il.add_value(k, data_dic[k])
        return il
class IaJohnsonIowacityBuildingPermitsSpider(CommonSpider):
    """Scrape Iowa City (Johnson County, IA) Tidemark building permits.

    ``parse`` submits one case-prefix search per run; ``parse_list`` walks
    the result table, emits a base item per case, then follows each case's
    detail page and emits one item per case action (tagging inspection and
    violation rows).
    """

    name = '1083_ia_johnson_iowacity_building_permits'
    allowed_domains = ['iowa-city.org']
    start_urls = ['http://www.iowa-city.org/IcgovApps/Tidemark/Search']
    main_url = 'http://www.iowa-city.org'
    handle_httpstatus_list = [500]
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1083_Permits_Buildings_IA_Johnson_IowaCity_CurationReady'),
        'JIRA_ID': 'AI_1083',
        'DOWNLOAD_DELAY': 0.5,
        'CONCURRENT_REQUESTS': 1,
        'TRACKING_OPTIONAL_PARAMS': ['case_number'],
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('IaJohnsonIowacityBuildingPermitsSpider'),
        'TOP_HEADER': {
            'case action-notes': 'Case Action-Notes',
            'case actions-date': 'Case Actions-Date',
            'case actions-description': 'Case Actions-Description',
            'case actions-status': 'Case Actions-Status',
            'inspection_date': 'Case Actions-Date.1',
            'inspection_description': '',
            'inspection_pass_fail': 'Case Actions-Status.1',
            'inspection_type': '',
            'location_address_string': 'Address',
            'permit_lic_desc': 'Description',
            'permit_lic_no': 'Case Number',
            'permit_lic_status': 'Status',
            'permit_type': '',
            'violation_date': '',
            'violation_type': ''
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'permit_lic_status', 'location_address_string',
            'permit_lic_desc', 'case actions-date',
            'case actions-description', 'case actions-status',
            'case action-notes', 'inspection_date', 'inspection_pass_fail',
            'inspection_description', 'inspection_type', 'violation_date',
            'violation_type', 'permit_type', 'sourceName', 'url',
            'ingestion_timestamp'
        ],
        'NULL_HEADERS': [
            'case actions-date', 'case actions-description',
            'case actions-status', 'case action-notes'
        ]
    }
    # Tidemark case-number prefixes searched one at a time.
    SearchCriteria = [
        'ELE', 'ABN', 'BLD', 'CSR', 'DAC', 'DEM', 'DRC', 'EXC', 'FAP',
        'FLD', 'FSP', 'HPC', 'MEC', 'PLM', 'PSD', 'WTR'
    ]
    # Ensures the start/end slice of SearchCriteria is taken only once.
    check_first = True

    def parse(self, response):
        """Submit the search form for the next case-number prefix."""
        if self.check_first:
            self.check_first = False
            self.search_element = self.SearchCriteria[
                int(self.start):int(self.end)]
        if len(self.search_element) > 0:
            param = self.search_element.pop(0)
            form_data = {
                'SearchTerms.CaseNumber': str(param),
                'SearchTerms.CaseAddress': ''
            }
            next_url = ('http://www.iowa-city.org/IcgovApps/Tidemark/'
                        'Search?Length=10')
            yield FormRequest(url=next_url,
                              formdata=form_data,
                              callback=self.parse_list,
                              dont_filter=True)

    @inline_requests
    def parse_list(self, response):
        """Emit one base item per case plus one item per case action."""
        meta = {}
        meta['case_action_date'] = meta['inspection_date'] = meta[
            'inspection_type'] = meta['case_action_notes'] = meta[
                'case_action_status'] = meta['case_action_description'] = meta[
                    'description'] = meta['address'] = meta['status'] = meta[
                        'case_number'] = meta['inspection_pass_fail'] = meta[
                            'inspection_description'] = meta[
                                'violation_date'] = meta['violation_type'] = ''
        # NOTE(review): hard-coded row window — looks like a manual resume
        # point from a previous run; confirm before relying on full coverage.
        first_table = response.xpath(
            '/html/body/div[2]/table//tr')[16185:19001]
        for i in first_table:
            meta['case_action_date'] = meta['case_action_notes'] = meta[
                'case_action_status'] = meta['case_action_description'] = ''
            meta['case_number'] = i.xpath('td[1]/a/text()').extract_first()
            meta['status'] = i.xpath('td[2]/text()').extract_first()
            address = i.xpath('td[3]/text()').extract_first()
            if address:
                meta['address'] = address + ', IowaCity, IA'
            else:
                meta['address'] = 'IA'
            desc = self.data_clean(i.xpath('td[4]/text()').extract_first())
            meta['description'] = desc if desc else 'Building Permit'
            # Base item for the case itself, before detail-page actions.
            yield self.save_to_csv(response, **meta)
            number_link = i.xpath('td[1]/a/@href').extract_first()
            next_page = self.main_url + str(number_link)
            link = yield scrapy.Request(
                url=next_page,
                dont_filter=True,
                meta={'optional': {'case_number': meta['case_number']}})
            if link.status == 500:
                # Detail page unavailable; the base item was already emitted.
                pass
            else:
                table = link.xpath('/html/body/div[2]/table[1]//tr')[1:]
                if table:
                    for j in table:
                        meta['inspection_date'] = meta[
                            'inspection_type'] = meta[
                                'inspection_pass_fail'] = meta[
                                    'inspection_description'] = meta[
                                        'violation_date'] = meta[
                                            'violation_type'] = ''
                        meta['case_action_date'] = j.xpath(
                            'td[1]/text()').extract_first()
                        meta['case_action_description'] = self.data_clean(
                            j.xpath('td[2]/text()').extract_first())
                        meta['case_action_status'] = j.xpath(
                            'td[3]/text()').extract_first()
                        meta['case_action_notes'] = ' '.join(
                            j.xpath('td[4]/text()').extract())
                        desc_txt = meta['case_action_description']
                        if desc_txt:
                            # 'Initial inspection'/'Re-inspection' are covered
                            # by the 'inspection' substring check.
                            if any(k in desc_txt for k in
                                   ('Inspection', 'Insp -', 'inspection')):
                                meta['inspection_date'] = meta[
                                    'case_action_date']
                                meta['inspection_type'] = (
                                    'building_inspection')
                                meta['inspection_pass_fail'] = meta[
                                    'case_action_status']
                                meta['inspection_description'] = meta[
                                    'case_action_notes']
                                meta['violation_date'] = meta[
                                    'violation_type'] = ''
                                # Inspection rows move status/notes into the
                                # inspection columns.
                                meta['case_action_status'] = meta[
                                    'case_action_notes'] = ''
                            if ('VIOLATION' in desc_txt
                                    or 'violation' in desc_txt):
                                meta['violation_date'] = meta[
                                    'case_action_date']
                                meta['violation_type'] = 'building_violation'
                        yield self.save_to_csv(response, **meta)
                else:
                    yield self.save_to_csv(response, **meta)
        if len(self.search_element) > 0:
            # Loop back to parse() for the next case-number prefix.
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse,
                                 dont_filter=True)

    def save_to_csv(self, response, **meta):
        """Build and return a loaded item from the accumulated ``meta``."""
        il = ItemLoader(item=IaJohnsonIowacityBuildingPermitsSpiderItem(),
                        response=response)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'IA_Johnson_IowaCity_Building_Permits')
        il.add_value('url',
                     'http://www.iowa-city.org/IcgovApps/Tidemark/Search')
        il.add_value('permit_lic_no', meta['case_number'])
        il.add_value('permit_lic_status', meta['status'])
        il.add_value('location_address_string', meta['address'])
        il.add_value('permit_lic_desc', meta['description'])
        il.add_value('case actions-date', meta['case_action_date'])
        il.add_value('case actions-description',
                     meta['case_action_description'])
        il.add_value('case actions-status', meta['case_action_status'])
        il.add_value('case action-notes', meta['case_action_notes'])
        il.add_value('inspection_date', meta['inspection_date'])
        il.add_value('inspection_type', meta['inspection_type'])
        il.add_value('inspection_pass_fail', meta['inspection_pass_fail'])
        il.add_value('inspection_description', meta['inspection_description'])
        il.add_value('violation_date', meta['violation_date'])
        il.add_value('violation_type', meta['violation_type'])
        il.add_value('permit_type', 'building_permit')
        return il.load_item()

    def data_clean(self, value):
        """Strip tags, collapse whitespace, and decode '&amp;' in ``value``."""
        if value:
            try:
                clean_tags = re.compile(r'<.*?>')
                desc_list = re.sub(r'\s+', ' ', re.sub(clean_tags, '', value))
                # FIX: was a no-op replace('&', '&'); decode the HTML entity.
                desc_list_rep = desc_list.replace('&amp;', '&')
                return desc_list_rep.strip()
            except Exception:
                return ''
        else:
            return ''
class MsSosSpider(CommonSpider):
    """Scrape the Mississippi Secretary of State business-entity search.

    POSTs to the public-search JSON endpoint one page (20 records) at a
    time and emits one item per business record.
    """

    name = 'ms_sos'
    allowed_domains = ['ms.gov']
    post_url = 'https://corp.sos.ms.gov/corpreporting/Dataset/PublicSearch'
    start_urls = ['https://corp.sos.ms.gov/corpreporting/Corp/BusinessSearch3']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1451_SOS_MS_CurationReady'),
        'JIRA_ID': 'AI_1451',
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        'JOBDIR': CustomSettings.getJobDirectory('ms_sos'),
        'TOP_HEADER': {
            'company_name': 'BusinessName',
            'company_subtype': 'EntityType',
            'county': 'County',
            'creation_date': 'FormationDate',
            'dba_name_': '',
            'dba_name': 'OtherNames',
            'domiciletype': 'DomicileType',
            'entity_id': 'BusinessId',
            'location_address_string': 'AddressLine+city_state+postal code',
            'naics': 'NAICSCode1+NAICSCode2+NAICSCode3',
            'non_profit_indicator': '',
            'permit_type': '',
            'status': 'Status'
        },
        'FIELDS_TO_EXPORT': [
            'entity_id', 'company_name', 'dba_name_', 'dba_name',
            'company_subtype', 'non_profit_indicator', 'domiciletype',
            'status', 'naics', 'location_address_string', 'county',
            'creation_date', 'permit_type', 'sourceName', 'url',
            'ingestion_timestamp'
        ],
        'NULL_HEADERS': ['domiciletype', 'county'],
        'REPLACE_VALUES': {
            'dba_name': [{'value': 'null', 'replace': ''}],
            'status': [{'value': 'Undefined', 'replace': ''}],
            'company_name': [{'value': 'null', 'replace': ''}],
            'county': [
                {'value': 'NONE', 'replace': ''},
                {'value': 'None', 'replace': ''},
                {'value': 'none', 'replace': ''},
            ],
            'location_address_string': [
                {'value': '', 'replace': 'MS'},
                {'value': 'N A, NA, MS NULL', 'replace': 'MS'},
                {'value': 'Unknown Address', 'replace': 'MS'},
                {'value': 'NONE', 'replace': 'MS'},
                {'value': 'None', 'replace': 'MS'},
                {'value': 'none', 'replace': 'MS'},
                {'value': 'N/A', 'replace': 'MS'},
                {'value': 'N/a', 'replace': 'MS'},
                {'value': 'n/a', 'replace': 'MS'},
                {'value': 'NA, NULL, MS NULL', 'replace': 'MS'},
                {'value': 'Unknown Address, MS', 'replace': 'MS'},
                {'value': 'UNKNOWN, MS', 'replace': 'MS'},
            ]
        }
    }
    page_no = 1           # page number of the response being parsed (for logging)
    page_no_list = []     # remaining page numbers still to fetch (strings)
    is_first_page = True  # only the first response computes the page count

    def format_address(self, address1, city, state, zip_code):
        """Join address1, city and "state zip" with commas, skipping empties."""
        state_zip = " ".join(i.strip() for i in [state, zip_code] if i)
        return ", ".join(y.strip() for y in [address1, city, state_zip] if y)

    def parse(self, response):
        """Request the first page of search results."""
        form_data = {
            'sort': 'BusinessName-asc',
            'page': '1',
            'pageSize': '20',
            'group': '',
            'filter': ''
        }
        yield scrapy.FormRequest(url=self.post_url, method="POST",
                                 formdata=form_data,
                                 callback=self.parse_details,
                                 dont_filter=True)

    def parse_details(self, response):
        """Parse one JSON result page; emit items and request the next page."""
        jsonresponse = json.loads(response.text.strip())
        print('----------------------------------------------------------------')
        print('---------------------------------------------------------------- Pageno: ', self.page_no)
        # Work out how many pages exist, from the first response only.
        if self.is_first_page == True:
            total_records = jsonresponse['Total']
            if total_records is not None and int(total_records) > 20:
                total_pages = int(total_records) / 20
                # round up when the last page is partial
                if not total_pages.is_integer():
                    total_pages = int(total_pages) + 1
                total_pages = int(total_pages)
            else:
                # BUG FIX: the original left total_pages unbound when
                # jsonresponse['Total'] was None, raising NameError below.
                total_pages = 1
            if total_pages == 1:
                self.page_no_list = []
            else:
                for page in range(2, int(total_pages) + 1):
                    self.page_no_list.append(str(page))
        for record in jsonresponse['Data']:
            # --- company / DBA names --------------------------------------
            name = record['BusinessName']
            if name is not None and name.strip() != '':
                company_name = self._getDBA(name)[0]
                # drop any (...) / [...] parenthetical and '-DBA' suffixes
                company_name = re.sub(r'[\(\[].*?[\)\]]', '', company_name)
                company_name = company_name.replace('-DBA', '').replace(
                    '-Dba', '').replace('-dba', '').strip()
                dba_name = self._getDBA(name)[1]
            else:
                company_name = ''
                dba_name = ''
            # fall back to the DBA name when no plain company name survived
            if company_name is not None and company_name.strip() != '':
                company_name = company_name.strip()
            elif dba_name is not None and dba_name.strip() != '':
                company_name = dba_name.strip()
            else:
                company_name = ''
            other_name = record['OtherNames']
            if other_name is not None and other_name.strip() != '':
                other_name = re.sub(r'[\(\[].*?[\)\]]', '', other_name)
                other_name = re.sub(r' DBA| dba| Dba|dba |DBA', '', other_name)
                other_name = re.sub(r'\s+', ' ', other_name.strip())
                other_name = other_name.replace(' ;', ';').replace(
                    '-DBA', '').replace('-Dba', '').replace('-dba', '').strip()
            else:
                other_name = ''
            indicator = 'Yes' if 'Non-Profit' in record['EntityType'] else ''
            # --- NAICS codes joined with '; ' (empty codes skipped) -------
            naics_code1 = record['NAICSCode1']
            naics_code2 = record['NAICSCode2']
            naics_code3 = record['NAICSCode3']
            naics_code123 = ((naics_code1 if naics_code1 else '') +
                             ('; ' if naics_code1 and naics_code2 else '') +
                             (naics_code2 if naics_code2 else '') +
                             ('; ' if naics_code2 and naics_code3 else '') +
                             (naics_code3 if naics_code3 else ''))
            # --- address normalisation ------------------------------------
            address = record['AddressLine1']
            if address is not None and address.strip() != '' and str(
                    address.strip()).lower() != 'null':
                # BUG FIX: strip multi-word placeholders *before* the
                # single-word ones; the original removed 'NONE' first, so
                # 'NONE SHOWN' / 'NONE AT THIS TIME' could never match.
                address = address.replace('N/A, N/A', '').replace(
                    'NONE SHOWN', '').replace('NONE AT THIS TIME', '').replace(
                    'N/A', '').replace('n/a', '').replace('N/a', '').replace(
                    'NONE', '').replace('None', '').replace('none', '').strip()
            else:
                address = ''
            city = record['City']
            if city is not None and city.strip() != '' and str(
                    city.strip()).lower() != 'null':
                city = city.replace('N/A', '').replace('n/a', '').replace(
                    'N/a', '').replace('none', '').strip()
            else:
                city = ''
            state = record['StateCode']
            if state is not None and state.strip() != '' and str(
                    state.strip()).lower() != 'null':
                state = state.replace('N/A', '').replace('n/a', '').replace(
                    'N/a', '').strip()
            else:
                state = 'MS'  # default state for this source
            zip_code = record['PostalCode']
            if zip_code is not None and zip_code.strip() != '' and str(
                    zip_code.strip()).lower() != 'null':
                zip_code = zip_code.replace('N/A', '').replace(
                    'n/a', '').replace('N/a', '').replace('none', '').strip()
            else:
                zip_code = ''
            address = self.format_address(address, city, state, zip_code)
            # --- formation date: '/Date(<epoch millis>)/' -----------------
            if record['FormationDate'] is not None:
                TimestampUtc = re.split(r'\(|\)', record['FormationDate'])[1]
                creation_date = (datetime.datetime(1970, 1, 1) +
                                 datetime.timedelta(
                                     seconds=(int(TimestampUtc) / 1000)))
                creation_date = self.format_date(creation_date)
            else:
                creation_date = ''
            il = ItemLoader(item=MsSosSpiderItem(), response=response)
            il.default_input_processor = MapCompose(lambda v: v.strip(),
                                                    remove_tags,
                                                    replace_escape_chars)
            # CONSISTENCY FIX: ingestion_timestamp is in FIELDS_TO_EXPORT but
            # was never populated (every sibling spider populates it).
            il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
            il.add_value('sourceName', 'MS_SOS')
            il.add_value(
                'url',
                'https://corp.sos.ms.gov/corpreporting/Corp/BusinessSearch3')
            il.add_value('entity_id', record['BusinessId'])
            il.add_value('company_name', company_name)
            il.add_value('dba_name_', dba_name)
            il.add_value('dba_name', other_name)
            il.add_value('company_subtype', record['EntityType'])
            il.add_value('non_profit_indicator', indicator)
            il.add_value('domiciletype', record['DomicileType'])
            il.add_value('status', record['Status'])
            il.add_value('naics', naics_code123)
            il.add_value('location_address_string', address)
            il.add_value('county', record['County'])
            il.add_value('creation_date', str(creation_date))
            il.add_value('permit_type', 'business_license')
            yield il.load_item()
        # --- pagination: fetch the next queued page -----------------------
        if len(self.page_no_list) > 0:
            self.is_first_page = False
            next_page = self.page_no_list.pop(0)
            self.page_no = str(next_page)
            form_data = {
                'sort': 'BusinessName-asc',
                'page': str(next_page),
                'pageSize': '20',
                'group': '',
                'filter': ''
            }
            yield scrapy.FormRequest(url=self.post_url, method="POST",
                                     formdata=form_data,
                                     callback=self.parse_details,
                                     dont_filter=True)
class GaHenryBuildingPermitsSpider(CommonSpider):
    """Scrape Henry County (GA) building permits from the SagesGov portal.

    Iterates street-number prefixes ('0%'..'99%', sliced by self.start /
    self.end), submits the captcha-protected search form for each, and
    walks the paginated results, emitting one item per contact row of
    every permit detail page.
    """

    name = '337_ga_henry_building_permits'
    allowed_domains = ['sagesgov.com']
    start_urls = ['https://www.sagesgov.com/henrycounty-ga/Portal/Search.aspx']
    # reCAPTCHA site key used by the portal's search form
    RECAPTCHA_SITE_KEY = '6Lf2Y4MUAAAAAJhd44kibO_8-rrh1JpGS8pa81jZ'
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-337_Permits_Buildings_GA_Henry_CurationReady'),
        'JIRA_ID': 'AI_337',
        'COOKIES_ENABLED': True,
        'DOWNLOAD_DELAY': 10,
        'CONCURRENT_REQUESTS': 1,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('ga_henry_building_permits'),
        'TOP_HEADER': {
            'contractor_address_string': 'General Contractor Address',
            'contractor_email': '',
            'contractor_phone': '',
            'location_address_string': 'Address',
            'mixed_contractor_name': 'General Contractor/company',
            'mixed_email': 'Contacts Email',
            'mixed_name': 'Contacts Name/company',
            'mixed_phone': 'Contacts Phone',
            'mixed_subtype': 'Contacts Type',
            'parcel number': 'Parcel Number',
            'permit_applied_date': 'SUBMITTED ON',
            'permit_lic_desc': 'Project/Case Name',
            'permit_lic_no': 'Project/Case #',
            'permit_lic_status': 'Project/Case Status',
            'permit_subtype': 'Process Type',
            'permit_type': '',
            'person_address_string': 'Contacts Address',
            'project/case coordinator': 'Project/Case Coordinator',
            'project/case coordinator phone': 'Project/Case Coordinator Phone',
            'submitted by': 'Submitted By'
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'permit_lic_desc', 'permit_lic_status',
            'permit_subtype', 'location_address_string', 'submitted by',
            'permit_applied_date', 'parcel number', 'project/case coordinator',
            'project/case coordinator phone', 'mixed_name', 'mixed_subtype',
            'person_address_string', 'mixed_email', 'mixed_phone',
            'mixed_contractor_name', 'contractor_address_string',
            'contractor_email', 'contractor_phone', 'permit_type',
            'sourceName', 'url', 'ingestion_timestamp',
        ],
        'NULL_HEADERS': [
            'submitted by', 'project/case coordinator', 'parcel number',
            'project/case coordinator phone'
        ]
    }

    def _search_request(self, response, page):
        """Build the captcha-protected ASP.NET search POST for self.number.

        Shared by parse() and the next-street branch of parse_next(); the
        original duplicated this ~30-field form in both places.
        """
        # BUG FIX: the original solved the captcha twice per request and
        # posted two *different* tokens; solve once and reuse it.
        captcha_token = self.getcaptchaCoder(
            self.RECAPTCHA_SITE_KEY).resolver(response.url)
        formdata = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': response.xpath(
                '//*[@id="__VIEWSTATE"]/@value').extract_first(),
            '__VIEWSTATEGENERATOR': response.xpath(
                '//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
            '__EVENTVALIDATION': response.xpath(
                '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
            'ctl00$ctl00$cphContent$cphMain$Search1$ddlClass': '1001',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$tbInstanceNumber': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$tbProjectName': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$rptrDateFilter$ctl01$tfddlDateFilter$ddlTimeframe': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$rptrDateFilter$ctl02$tfddlDateFilter$ddlTimeframe': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$rptrDateFilter$ctl03$tfddlDateFilter$ddlTimeframe': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$rptrDateFilter$ctl04$tfddlDateFilter$ddlTimeframe': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$rptrDateFilter$ctl05$tfddlDateFilter$ddlTimeframe': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$tbStreetNumber': str(self.number),
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$tbStreetName': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$ddlStreetType': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$tbCity': '',
            'ctl00$ctl00$cphContent$cphMain$Search1$SearchOrViewFilters1$tbParcelNumber': '',
            'g-recaptcha-response': captcha_token,
            'ctl00$ctl00$cphContent$cphMain$ctrlCaptcha$txtCaptchaToken': captcha_token,
            'ctl00$ctl00$cphContent$cphMain$btnSearch': 'Search'
        }
        header = {
            "Accept-Encoding": "gzip, deflate, br",
            "Host": "www.sagesgov.com",
            "Origin": "https://www.sagesgov.com",
            "Referer": "https://www.sagesgov.com/henrycounty-ga/Portal/Search.aspx",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"
        }
        return scrapy.FormRequest.from_response(
            response,
            url='https://www.sagesgov.com/henrycounty-ga/Portal/Search.aspx',
            method="POST",
            formdata=formdata,
            headers=header,
            dont_filter=True,
            callback=self.parse_next,
            meta={'page': page})

    def parse(self, response):
        """Seed the street-number queue and submit the first search."""
        # collapse whitespace; the .replace('[\n\t\r"]', '') literal replace
        # is a no-op kept from the original (it never matches)
        self.remove_tag = lambda data: re.sub(
            r'\s+', ' ', data).replace('[\n\t\r"]', '').strip() if data else ''
        # street-number prefixes '0%'..'99%', sliced by CLI args start/end
        self.street = [str(i) + "%" for i in range(0, 100)]
        self.streetnumber = self.street[int(self.start):int(self.end)]
        print(self.streetnumber)
        if self.streetnumber:
            self.number = self.streetnumber.pop(0)
            yield self._search_request(response, page=1)

    @inline_requests
    def parse_next(self, response):
        """Parse one results page: fetch each detail page inline, emit items,
        then either paginate or move on to the next street number."""
        pageno = response.meta['page']
        # progress journal: record which street number we are on
        with open(
                os.path.dirname(os.path.realpath(__file__)) +
                '/parse_number.txt', 'a') as f:
            f.write(str(self.number))
            f.write('\n')
        dic = {}
        table = response.xpath(
            "//table[@id='cphContent_cphMain_SearchResults1_gvSearchResults']//tr"
        )[1:-1]
        if table:
            for row in table:
                i = row.xpath(".//td/a/@href").extract_first()
                # ROBUSTNESS FIX: guard against rows without an href (the
                # original raised TypeError on None); skip javascript links
                if i and 'java' not in i:
                    link = 'https://www.sagesgov.com/henrycounty-ga/Portal/' + i
                    print('\n\n')
                    print("link: ", link)
                    print('\n\n')
                    dic['permit_applied_date'] = self.format_date1(
                        row.xpath('.//td[6]/text()').extract_first())
                    # inline request: the detail page response comes back here
                    scrape_data = yield scrapy.Request(url=link,
                                                       dont_filter=True)
                    dic['permit_lic_no'] = scrape_data.xpath(
                        "//div[contains(text(),'Project/Case')]/following-sibling::div/text()"
                    ).extract_first()
                    dic['permit_lic_desc'] = scrape_data.xpath(
                        "//div[contains(text(),'Project/Case Name')]/following-sibling::div/text()"
                    ).extract_first()
                    dic['permit_lic_status'] = scrape_data.xpath(
                        "//div[contains(text(),'Status')]/following-sibling::div/text()"
                    ).extract_first()
                    dic['permit_subtype'] = scrape_data.xpath(
                        "//div[contains(text(),'Process Type')]/following-sibling::div/text()"
                    ).extract_first()
                    dic['location_address'] = scrape_data.xpath(
                        "//div[contains(text(),'Address')]/following-sibling::div/text()"
                    ).extract_first()
                    # normalise '... Georgia zip' into '..., GA zip'
                    if dic['location_address']:
                        if 'Georgia' in dic['location_address']:
                            d = dic['location_address'].strip().replace(
                                " Georgia", '').split(" ")
                            d.insert(-1, ", GA")
                            dic['location_address_string'] = (
                                ' '.join(d).replace(" ,", ','))
                        else:
                            dic['location_address_string'] = dic[
                                'location_address']
                    else:
                        dic['location_address_string'] = "GA"
                    dic['submitted by'] = scrape_data.xpath(
                        "//div[contains(text(),'Submitted By')]/following-sibling::div/text()"
                    ).extract_first()
                    dic['parcel number'] = scrape_data.xpath(
                        "//div[contains(text(),'Parcel #')]/following-sibling::div/text()"
                    ).extract_first()
                    coordinator = scrape_data.xpath(
                        "//div[contains(text(),'Project/Case Coordinator')]/following-sibling::div/text()"
                    ).extract_first()
                    # split 'Name (phone)' into coordinator name and phone
                    dic['project/case coordinator'] = ''
                    dic['project/case coordinator phone'] = ''
                    if coordinator:
                        p = re.findall(r'.\d+.', coordinator)
                        dic['project/case coordinator phone'] = ''.join(p)
                        if dic['project/case coordinator phone']:
                            dic['project/case coordinator'] = coordinator.replace(
                                dic['project/case coordinator phone'], '')
                        else:
                            dic['project/case coordinator'] = coordinator
                            dic['project/case coordinator phone'] = ''
                    contacts = scrape_data.xpath(
                        "//h3[contains(text(),'Contacts')]/following::div[1]//tr"
                    )[1:]
                    if contacts:
                        # one item per contact row; contractor fields and
                        # mixed-contact fields are mutually exclusive
                        for contact in contacts:
                            contact_type = contact.xpath(
                                ".//th/text()").extract_first()
                            if contact_type == 'General Contractor':
                                dic['mixed_subtype'] = ''
                                dic['mixed_name'] = ''
                                dic['person_address_string'] = ''
                                dic['mixed_email'] = ''
                                dic['mixed_phone'] = ''
                                dic['mixed_contractor_name'] = contact.xpath(
                                    ".//td[1]/text()").extract_first()
                                dic['contractor_address_string'] = self.address(
                                    contact.xpath(
                                        ".//td[5]/text()").extract_first())
                                dic['contractor_email'] = contact.xpath(
                                    ".//td[3]/text()").extract_first()
                                dic['contractor_phone'] = contact.xpath(
                                    ".//td[4]/text()").extract_first()
                            else:
                                dic['mixed_subtype'] = contact_type
                                dic['mixed_name'] = contact.xpath(
                                    ".//td[1]/text()").extract_first()
                                dic['person_address_string'] = self.address(
                                    contact.xpath(
                                        ".//td[5]/text()").extract_first())
                                dic['mixed_email'] = contact.xpath(
                                    ".//td[3]/text()").extract_first()
                                dic['mixed_phone'] = contact.xpath(
                                    ".//td[4]/text()").extract_first()
                                dic['mixed_contractor_name'] = ''
                                dic['contractor_address_string'] = ''
                                dic['contractor_email'] = ''
                                dic['contractor_phone'] = ''
                            yield self.save_csv(response, dic).load_item()
                    else:
                        yield self.save_csv(response, dic).load_item()
            # ---- pagination: find the link following the current page span
            nxt_link = response.xpath(
                "//td/span[contains(text(),'" + str(pageno) +
                "')]/following::td/a/@href").extract_first()
            if nxt_link:
                print("next page link: ", nxt_link, '\n\n')
                print('next page no : ', response.meta['page'] + 1)
                form_args_pagn = JavaScriptUtils.getValuesFromdoPost(nxt_link)
                formdata = {
                    "ctl00$ctl00$ScriptManager1":
                    'ctl00$ctl00$cphContent$cphMain$UpdatePanel1|ctl00$ctl00$cphContent$cphMain$SearchResults1$gvSearchResults',
                    "__EVENTTARGET": form_args_pagn['__EVENTTARGET'],
                    "__EVENTARGUMENT": form_args_pagn['__EVENTARGUMENT'],
                    "__VIEWSTATE": response.xpath(
                        '//*[@id="__VIEWSTATE"]/@value').extract_first(),
                    "__VIEWSTATEGENERATOR": response.xpath(
                        '//*[@id="__VIEWSTATEGENERATOR"]/@value'
                    ).extract_first(),
                    "__EVENTVALIDATION": response.xpath(
                        '//*[@id="__EVENTVALIDATION"]/@value').extract_first(),
                    # "__ASYNCPOST" : "true"
                }
                headers = {
                    "Host": "www.sagesgov.com",
                    "Origin": "https://www.sagesgov.com",
                    "Referer": "https://www.sagesgov.com/henrycounty-ga/Portal/SearchResults.aspx",
                    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
                    # "X-MicrosoftAjax": "Delta=true",
                    "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
                    "X-Requested-With": "XMLHttpRequest"
                }
                yield scrapy.FormRequest(
                    url='https://www.sagesgov.com/henrycounty-ga/Portal/SearchResults.aspx',
                    headers=headers,
                    formdata=formdata,
                    callback=self.parse_next,
                    dont_filter=True,
                    meta={'page': response.meta['page'] + 1})
            else:
                # NOTE(review): when a street's pages are exhausted the
                # original never advanced to the next street number here;
                # behaviour kept — confirm whether that is intended.
                print("<<<<<<<<<<<no next page in link>>>>>>>>>>>")
        else:
            # no results for this street number: journal it and move on
            now = datetime.datetime.now()
            print(now)
            with open(
                    os.path.dirname(os.path.realpath(__file__)) +
                    '/nodata.txt', 'a') as f:
                f.write(str(self.number))
                f.write('\n')
            print(
                "<<<<<<<<<<<<<<<<<<<<<<<<<<<NO result in this page>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>",
                self.number)
            if self.streetnumber:
                self.number = self.streetnumber.pop(0)
                # BUG FIX: a fresh search starts on page 1; the original
                # carried the stale page meta over, so the pagination xpath
                # (keyed on the current page number) never matched.
                yield self._search_request(response, page=1)

    def save_csv(self, response, data_dic):
        """Populate an item loader from the scraped field dict (not loaded)."""
        il = ItemLoader(item=GaHenryBuildingPermitsSpiderItem(),
                        response=response)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'GA_Henry_Building_Permits')
        il.add_value(
            'url', 'https://www.sagesgov.com/henrycounty-ga/Portal/Search.aspx')
        il.add_value('permit_type', 'building_permit')
        for k in data_dic:
            il.add_value(k, (data_dic[k]))
        return il

    def address(self, address):
        """Insert ', ' before the state token of a space-separated address.

        ROBUSTNESS FIX: the original crashed with AttributeError when
        extract_first() returned None.
        """
        if address and address.strip():
            d = address.strip().split(" ")
            d.insert(-2, ",")
            return ' '.join(d).replace(" ,", ',')
        return ''

    def format_date1(self, s):
        """Convert 'Mon DD, YYYY' to 'MM/DD/YYYY'; '' for empty input."""
        if s:
            dat = datetime.datetime.strptime(s, '%b %d, %Y')
            return datetime.datetime.strftime(dat, '%m/%d/%Y')
        return ''
class RiSepticSystemLicensesSpider(CommonSpider):
    """Scrape RI DEM septic-system (ISDS) license search, town by town.

    The town list is sliced by self.start / self.end (CLI args). Each town
    page either lists result rows linking to detail pages, or is itself a
    single detail page (identified by an '<h2>... #NNN' heading).
    """

    name = '1259_ri_septic_system_licenses'
    allowed_domains = ['ri.gov']
    start_urls = ['https://www.ri.gov/DEM/isdssearch/']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName(
            'AI-1259_Licenses_RI_Septic_System_CurationReady'),
        'JIRA_ID': 'AI_1259',
        'DOWNLOAD_DELAY': .2,
        'COOKIES_ENABLED': True,
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('ri_septic_system_licenses'),
        'TOP_HEADER': {
            'city/town': 'City/Town',
            'company_name': 'Corp Owner',
            'dba_name': '',
            'location_address_string': 'Location',
            'lot': 'lot',
            'permit_lic_desc': '',
            'permit_lic_no': 'Application number',
            'permit_type': '',
            'person_name': 'Designer Name',
            'plat': 'Plat',
            'sublot': 'sublot',
            'total flow': 'Total Flow'
        },
        'FIELDS_TO_EXPORT': [
            'permit_lic_no', 'city/town', 'location_address_string', 'plat',
            'lot', 'sublot', 'company_name', 'dba_name', 'person_name',
            'total flow', 'permit_lic_desc', 'permit_type', 'url',
            'sourceName', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': ['city/town', 'plat', 'lot', 'sublot', 'total flow']
    }

    def _rem_esc(self, data):
        """Collapse whitespace; '' for None/empty (mirrors the old lambda)."""
        if not data:
            return ''
        return re.sub(r'\s+', ' ', data.replace('[\n\t\r]', '').strip())

    def _detail_fields(self, main_res):
        """Pull the labelled text fields off one permit detail page."""
        rem = self._rem_esc
        location_address_string = rem(main_res.xpath(
            "//em[contains(text(),'Location')]/following::text()"
        ).extract_first())
        plat_lot = rem(''.join(main_res.xpath(
            '//em[contains(text(),"Plat")]/following::text()').extract()[:2]))
        Owner_name = rem(main_res.xpath(
            "//em[contains(text(),'Owner Name')]/following::text()"
        ).extract_first())
        corp_owner = rem(main_res.xpath(
            "//em[contains(text(),'Corp Owner')]/following::text()"
        ).extract_first())
        designer = rem(main_res.xpath(
            "//em[contains(text(),'Designer')]/following::text()"
        ).extract_first())
        total = rem(main_res.xpath(
            "//em[contains(text(),'Total')]/following::text()"
        ).extract_first())
        return location_address_string, plat_lot, Owner_name, corp_owner, designer, total

    def _parse_plat_lot(self, plat_lot):
        """Split the 'Plat ... Lot ... Sublot ...' text into its parts.

        Returns (plat, lot, sublot); missing parts are ''. This logic was
        copy-pasted three times in the original.
        """
        plat = lot = sublot = ''
        if plat_lot:
            if 'Plat' in plat_lot and 'Lot' in plat_lot and 'Sublot' in plat_lot:
                plat = re.search('Plat.*Lot', plat_lot).group()[4:-3].strip()
                lot = re.search('Lot.*Sublot', plat_lot).group()[3:-6].strip()
                sublot = re.search('Sublot.*', plat_lot).group()[6:].strip()
            elif 'Plat' in plat_lot and 'Lot' in plat_lot:
                plat = re.search('Plat.*Lot', plat_lot).group()[4:-3].strip()
                lot = re.search('Lot.*', plat_lot).group()[3:].strip()
            elif 'Plat' in plat_lot and 'Sublot' in plat_lot:
                plat = re.search('Plat.*Sublot', plat_lot).group()[4:-6].strip()
                # NOTE: mirrors the original, which stored the Sublot text in
                # `lot` (not `sublot`) on this branch.
                lot = re.search('Sublot.*', plat_lot).group()[6:].strip()
            elif 'plat' in plat_lot.lower():
                # NOTE(review): re.search('Plat'...) returns None when only a
                # lowercase 'plat' occurs — kept as in the original; confirm.
                plat = re.search('Plat.*', plat_lot).group()[4:].strip()
        return plat, lot, sublot

    def _make_item(self, response, permit_lic_no, location_address_string,
                   plat, lot, sublot, corp_owner, Owner_name, designer, total,
                   person_name):
        """Assemble one loaded ItemLoader from the extracted detail fields."""
        il = ItemLoader(item=RiSepticSystemLicensesSpiderItem(),
                        response=response)
        il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'RI_Septic_System_Licenses')
        il.add_value('url', 'https://www.ri.gov/DEM/isdssearch/')
        il.add_value('permit_lic_no', permit_lic_no)
        il.add_value('city/town', response.meta['city'])
        il.add_value('location_address_string',
                     location_address_string.strip() + ", RI")
        # upper-case plat/lot/sublot; strip a trailing '&' continuation mark
        il.add_value('plat', (plat.upper().strip())[:-1]
                     if plat.endswith('&') else plat.upper())
        il.add_value('lot', (lot.upper().strip())[:-1]
                     if lot.endswith('&') else lot.upper())
        il.add_value('sublot', (sublot.upper().strip())[:-1]
                     if sublot.endswith('&') else sublot.upper())
        # prefer corporate owner, then personal owner, then designer
        company_name = (corp_owner if corp_owner.strip() else
                        Owner_name if Owner_name.strip() else
                        designer if designer.strip() else '')
        com_name = self._getDBA(company_name)
        designer_dba = self._getDBA(designer)
        permit_lic_desc = 'Septic System Licenses'
        if com_name[0]:
            permit_lic_desc += " For " + com_name[0]
        il.add_value('company_name',
                     com_name[0] if com_name[0].strip() else designer_dba[0])
        il.add_value('dba_name',
                     com_name[1] if com_name[1] else designer_dba[1])
        il.add_value('person_name', person_name)
        il.add_value('total flow', '' if 'Not available' in total else total)
        il.add_value('permit_lic_desc', permit_lic_desc)
        il.add_value('permit_type', 'utility_license')
        return il

    def parse(self, response):
        """Slice the town list and request the first town's result page."""
        self.rem_lis = lambda lis: [
            dd.replace('[\n\t\r]', '').strip() for dd in lis]
        print('start = ', self.start, ' end = ', self.end)
        option = response.xpath(
            "//select[@id='town']/option/@value"
        ).extract()[int(self.start):int(self.end)]
        print('======>city : ', option)
        self.pos = 0
        self.city = option
        self.url = 'https://www.ri.gov/DEM/isdssearch/index.php?town='
        yield scrapy.Request(url=self.url + self.city[self.pos],
                             dont_filter=True,
                             meta={'city': self.city[self.pos]},
                             callback=self.parse_data)

    @inline_requests
    def parse_data(self, response):
        """Handle one town result page (list of links OR a direct detail)."""
        print('============> ', response.meta['city'], ' : ',
              response.xpath('//p/text()').extract())
        view_links = response.xpath("//td/div/a/@href").extract()
        permit_lic = response.xpath("//td/a/text()").extract()
        if view_links:
            # result list: fetch each detail page inline and emit an item
            for link, permit_lic_no in zip(view_links, permit_lic):
                main_res = yield scrapy.Request(
                    url='https://www.ri.gov/DEM/isdssearch/' + link.strip())
                (location_address_string, plat_lot, Owner_name, corp_owner,
                 designer, total) = self._detail_fields(main_res)
                plat, lot, sublot = self._parse_plat_lot(plat_lot)
                # NOTE: this branch records the raw designer string as
                # person_name (the direct-detail branch records its DBA form)
                il = self._make_item(response, permit_lic_no,
                                     location_address_string, plat, lot,
                                     sublot, corp_owner, Owner_name, designer,
                                     total, person_name=designer)
                yield il.load_item()
        elif response.xpath('//h2/text()').extract_first():
            # single-result page: the response itself is the detail page
            permit_lic_no = (response.xpath(
                '//h2/text()').extract_first()).split('#')[1]
            main_res = response
            (location_address_string, plat_lot, Owner_name, corp_owner,
             designer, total) = self._detail_fields(main_res)
            plat, lot, sublot = self._parse_plat_lot(plat_lot)
            il = self._make_item(response, permit_lic_no,
                                 location_address_string, plat, lot, sublot,
                                 corp_owner, Owner_name, designer, total,
                                 person_name=self._getDBA(designer)[0])
            yield il.load_item()
        # pagination within a town, then advance to the next town
        if response.xpath(
                '//span[@class="nextButton"]/a/@href').extract_first():
            yield scrapy.Request(
                url='https://www.ri.gov/DEM/isdssearch/index.php' +
                response.xpath('//span/a/@href').extract_first(),
                meta={'city': response.meta['city']},
                callback=self.parse_data)
        elif len(self.city) > self.pos + 1:
            self.pos += 1
            print('====pos', self.pos)
            yield scrapy.Request(url=self.url + self.city[self.pos],
                                 dont_filter=True,
                                 meta={'city': self.city[self.pos]},
                                 callback=self.parse_data)

    def save_csv(self, response, main_res, permit_lic_no):
        """Emit one item from a detail response (helper kept for callers).

        BUG FIX: the original referenced the undefined names `rem_esc` and
        `company`, and indexed the *string* `designer` instead of its
        _getDBA() tuple — any call raised NameError / returned characters.
        """
        (location_address_string, plat_lot, Owner_name, corp_owner, designer,
         total) = self._detail_fields(main_res)
        plat, lot, sublot = self._parse_plat_lot(plat_lot)
        il = self._make_item(response, permit_lic_no, location_address_string,
                             plat, lot, sublot, corp_owner, Owner_name,
                             designer, total,
                             person_name=self._getDBA(designer)[0])
        yield il.load_item()
class MoStlouisContractorLicensesSpider(CommonSpider):
    """Scrape St. Louis County (MO) contractor licenses.

    The licensing page links four PDF lists (electrical, mechanical,
    plumbing, backflow testing); each PDF is downloaded to a temp file,
    parsed with tabula, and every licensee row is exported via
    ``save_to_csv``.
    """
    name = '1264_mo_stlouis_contractor_licenses'
    allowed_domains = ['stlouisco.com']
    start_urls = ['https://www.stlouisco.com/YourGovernment/PublicWorks/Licensing']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1264_Licenses_StLouis_Contractor_MO_CurationReady'),
        'JIRA_ID': 'AI_1264',
        'HTTPCACHE_ENABLED': False,
        'COOKIES_ENABLED': True,
        'DOWNLOAD_DELAY': 0.1,
        'COOKIES_DEBUG': True,
        # NOTE(review): duplicate key — 'HTTPCACHE_ENABLED' is already set
        # above; this repeat is redundant (harmless, later value wins).
        'HTTPCACHE_ENABLED': False,
        # 'JOBDIR' : CustomSettings.getJobDirectory('mo_stlouis_contractor_licenses'),
        'TOP_HEADER': {'company_name': 'COMPANY NAME', 'company_phone': 'PHONE/Contact #', 'dba_name': '', 'location_address_string': 'BUSINESS LOCATION', 'mail_address_string': 'Mailing Address', 'permit_lic_desc': '', 'permit_subtype': 'Type', 'permit_type': '', 'person_name': 'LICENSE HOLDER', 'person_subtype': '', 'type/licensed contractors': 'TYPE/Licensed Contractors'},
        'FIELDS_TO_EXPORT': ['permit_subtype', 'company_name', 'dba_name', 'person_name', 'person_subtype', 'type/licensed contractors', 'location_address_string', 'mail_address_string', 'company_phone', 'permit_lic_desc', 'permit_type', 'sourceName', 'url', 'ingestion_timestamp'],
        'NULL_HEADERS': ['type/licensed contractors']
    }

    @inline_requests
    def parse(self, response):
        """Check the four license-list links and request each PDF.

        ``meta['file2']`` tags each request with a list-type code
        ('elec'/'mech'/'plum'/'back') so :meth:`pdf_content` knows which
        tabula layout to apply.
        """
        file1 = response.xpath('//*[@id="dnn_ctr7787_HtmlModule_lblContent"]/div/p[1]/a/@href').extract_first()
        file2 = response.xpath('//*[@id="dnn_ctr7787_HtmlModule_lblContent"]/div/p[3]/a/@href').extract_first()
        file3 = response.xpath('//*[@id="dnn_ctr7787_HtmlModule_lblContent"]/div/p[5]/a/@href').extract_first()
        file4 = response.xpath('//*[@id="dnn_ctr7787_HtmlModule_lblContent"]/div/p[7]/a/@href').extract_first()
        if file1:
            # NOTE(review): `file2` is reused as the list-type tag here,
            # clobbering the xpath result read above.
            file2 = 'elec'
            yield scrapy.Request(url='https://www.stlouisco.com/Portals/8/docs/document%20library/public%20works/code%20enforcement/licenses/elect/Elec-Contr.pdf', callback=self.pdf_content, dont_filter=True, meta={'file2': file2})
        if file2:
            file2 = 'mech'
            yield scrapy.Request(url='https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/mech/Internet%20Contractor%20List%20-%20Jan%202019.pdf', callback=self.pdf_content, dont_filter=True, meta={'file2': file2})
        if file3:
            file2 = 'plum'
            # @inline_requests lets us await the plumbing index page inline
            # and then fan out to each linked PDF.
            parse_value = yield scrapy.Request(url='https://www.stlouisco.com/Your-Government/Public-Works/Licensing/LicPlumb', dont_filter=True)
            link = parse_value.xpath('//*[@id="dnn_ctr8308_HtmlModule_lblContent"]/table//tr[2]/td[2]/ul/li/a/@href').extract()
            for i in link:
                link_url = 'https://www.stlouisco.com' + str(i)
                yield scrapy.Request(url=link_url, callback=self.pdf_content, dont_filter=True, meta={'file2': file2})
        if file4:
            file2 = 'back'
            parse_value = yield scrapy.Request(url='https://www.stlouisco.com/Your-Government/Public-Works/Licensing/LicBackFlow', dont_filter=True)
            link = parse_value.xpath('//*[@id="dnn_ctr7825_HtmlModule_lblContent"]/table//tr[2]/td[2]/ul/li/a/@href').extract()
            for i in link:
                link_url = 'https://www.stlouisco.com' + str(i)
                yield scrapy.Request(url=link_url, callback=self.pdf_content, dont_filter=True, meta={'file2': file2})

    def pdf_content(self, response):
        """Parse a downloaded license PDF according to its list type.

        ``response.meta['file2']`` selects the layout ('elec', 'mech',
        'plum' or 'back'); each parsed licensee is yielded through
        :meth:`save_to_csv`.
        """
        meta = {}
        file_name = response.meta['file2']
        url = response.url
        # tabula needs a file on disk, not a response body.
        file = self.storeGet_tempfile(response)
        #electrical
        if str(file_name) == 'elec':
            meta['permit_subtype'] = meta['company_name'] = meta['dba_name'] = meta['person_name'] = meta['person_subtype'] = meta['type/licensed contractors'] = meta['location_address_string'] = meta['mail_address_string'] = meta['company_phone'] = meta['permit_lic_desc'] = meta['permit_type'] = ''
            # Fixed column x-coordinates for the electrical list layout.
            df = tabula.read_pdf(file, pages='all', encoding='ISO-8859-1', guess=False, columns=[211.905, 333.54, 391.68, 520.2, 593.64], area=[16.448, 11.475, 735.548, 595.17], pandas_options={'header': 'infer'})
            for _, row in df.fillna('').iterrows():
                row = row.tolist()
                meta['permit_subtype'] = 'ELECTRICAL LICENSING'
                company_name = row[0]
                meta['company_name'] = self._getDBA(company_name)[0]
                person_name = row[1]
                meta['person_name'] = self._getDBA(person_name)[0]
                meta['person_subtype'] = 'License Holder'
                meta['type/licensed contractors'] = row[2]
                meta['location_address_string'] = row[3]
                meta['company_phone'] = row[4]
                meta['permit_lic_desc'] = 'ELECTRICAL LICENSING'
                meta['permit_type'] = 'electrical_contractor_license'
                dba_name1 = self._getDBA(company_name)[1]
                dba_name2 = self._getDBA(person_name)[1]
                if meta['company_name']:
                    meta['company_name'] = meta['company_name']
                else:
                    if meta['person_name']:
                        # Fall back to the license holder's name as company.
                        meta['company_name'] = meta['person_name']
                # NOTE(review): the else pairs with `if dba_name2` only, so a
                # row with dba_name1 set and dba_name2 empty is emitted twice
                # (once here and once in the else branch) — confirm intent.
                if dba_name1:
                    meta['dba_name'] = dba_name1
                    yield self.save_to_csv(response, **meta)
                if dba_name2:
                    meta['dba_name'] = dba_name2
                    yield self.save_to_csv(response, **meta)
                else:
                    yield self.save_to_csv(response, **meta)
        #mechanical
        if str(file_name) == 'mech':
            meta['permit_subtype'] = meta['company_name'] = meta['dba_name'] = meta['person_name'] = meta['person_subtype'] = meta['type/licensed contractors'] = meta['location_address_string'] = meta['mail_address_string'] = meta['company_phone'] = meta['permit_lic_desc'] = meta['permit_type'] = ''
            # Mechanical list is a proper table; spreadsheet mode works here.
            df = tabula.read_pdf(file, pages='all', encoding='ISO-8859-1', guess=False, spreadsheet=True, pandas_options={'header': 'infer'})
            for _, row in df.fillna('').iterrows():
                row = row.tolist()
                meta['permit_subtype'] = 'MECHANICAL LICENSING'
                meta['type/licensed contractors'] = 'Licensed Mechanical Contractors'
                company_name = row[0]
                meta['company_name'] = self._getDBA(company_name)[0]
                meta['dba_name'] = self._getDBA(company_name)[1]
                # street, city, state, zip columns -> one address string.
                meta['location_address_string'] = row[1] + ', ' + row[2] + ', ' + row[3] + ' ' + row[4]
                meta['company_phone'] = row[5]
                meta['permit_lic_desc'] = 'MECHANICAL LICENSING'
                meta['permit_type'] = 'contractor_license'
                yield self.save_to_csv(response, **meta)
        #plumbing
        if str(file_name) == 'plum':
            meta['permit_subtype'] = meta['company_name'] = meta['dba_name'] = meta['person_name'] = meta['person_subtype'] = meta['type/licensed contractors'] = meta['location_address_string'] = meta['mail_address_string'] = meta['company_phone'] = meta['permit_lic_desc'] = meta['permit_type'] = ''
            df = tabula.read_pdf(file, pages='all', encoding='ISO-8859-1', guess=True, stream=True, pandas_options={'header': 'infer'})
            for _, row in df.fillna('').iterrows():
                row = row.tolist()
                company_name = row[0]
                # first + last name columns.
                person_name = row[1] + ' ' + row[2]
                meta['company_name'] = self._getDBA(company_name)[0]
                meta['person_name'] = self._getDBA(person_name)[0]
                meta['mail_address_string'] = row[3] + ', ' + row[4] + ', ' + row[5] + ' ' + row[6]
                meta['company_phone'] = row[7]
                meta['permit_type'] = 'plumbing_contractor_license'
                meta['permit_subtype'] = 'PLUMBING LICENSING'
                if meta['company_name']:
                    meta['company_name'] = meta['company_name']
                else:
                    if meta['person_name']:
                        meta['company_name'] = meta['person_name']
                dba_name1 = self._getDBA(company_name)[1]
                dba_name2 = self._getDBA(person_name)[1]
                # The PDF's URL identifies which plumbing sub-list this is.
                if str(url) == 'https://www.stlouisco.com/Portals/8/docs/document%20library/public%20works/code%20enforcement/licenses/plumb/Master-Drainlayers.pdf?VR=0619':
                    meta['permit_lic_desc'] = 'PLUMBING LICENSING-Master Drainlayers'
                    meta['type/licensed contractors'] = 'Master Drainlayers'
                    if dba_name1:
                        meta['dba_name'] = dba_name1
                        yield self.save_to_csv(response, **meta)
                    if dba_name2:
                        meta['dba_name'] = dba_name2
                        yield self.save_to_csv(response, **meta)
                    else:
                        yield self.save_to_csv(response, **meta)
                elif str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/plumb/Monthly%20Public%20Information%20-%20Master%20PipeFitters_New.pdf':
                    meta['permit_lic_desc'] = 'PLUMBING LICENSING-Master PipeFitters'
                    meta['type/licensed contractors'] = 'Master PipeFitters'
                    if dba_name1:
                        meta['dba_name'] = dba_name1
                        yield self.save_to_csv(response, **meta)
                    if dba_name2:
                        meta['dba_name'] = dba_name2
                        yield self.save_to_csv(response, **meta)
                    else:
                        yield self.save_to_csv(response, **meta)
                elif str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/plumb/Master-Plumbers.pdf?VR=0619':
                    meta['permit_lic_desc'] = 'PLUMBING LICENSING-Master Plumbers'
                    meta['type/licensed contractors'] = 'Master Plumbers'
                    if dba_name1:
                        meta['dba_name'] = dba_name1
                        yield self.save_to_csv(response, **meta)
                    if dba_name2:
                        meta['dba_name'] = dba_name2
                        yield self.save_to_csv(response, **meta)
                    else:
                        yield self.save_to_csv(response, **meta)
                elif str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/plumb/Monthly%20Public%20Information%20-%20Master%20SprinklerFitters_New.pdf':
                    meta['permit_lic_desc'] = 'PLUMBING LICENSING-Master SprinklerFitters'
                    meta['type/licensed contractors'] = 'Master SprinklerFitters'
                    if dba_name1:
                        meta['dba_name'] = dba_name1
                        yield self.save_to_csv(response, **meta)
                    if dba_name2:
                        meta['dba_name'] = dba_name2
                        yield self.save_to_csv(response, **meta)
                    else:
                        yield self.save_to_csv(response, **meta)
                elif str(url) == 'https://www.stlouisco.com/Portals/8/docs/document%20library/public%20works/code%20enforcement/licenses/plumb/Monthly%20Public%20Information%20-%20Master%20Water%20Heater%20Contractors.pdf':
                    meta['permit_lic_desc'] = 'PLUMBING LICENSING-Master Water Heater Contractors'
                    meta['type/licensed contractors'] = 'Master Water Heater Contractors'
                    if dba_name1:
                        meta['dba_name'] = dba_name1
                        yield self.save_to_csv(response, **meta)
                    if dba_name2:
                        meta['dba_name'] = dba_name2
                        yield self.save_to_csv(response, **meta)
                    else:
                        yield self.save_to_csv(response, **meta)
        #black Flow
        if str(file_name) == 'back':
            meta['permit_subtype'] = meta['company_name'] = meta['dba_name'] = meta['person_name'] = meta['person_subtype'] = meta['type/licensed contractors'] = meta['location_address_string'] = meta['mail_address_string'] = meta['company_phone'] = meta['permit_lic_desc'] = meta['permit_type'] = ''
            if str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/Backflow/Lawn%20Irrigation%20Contractors.pdf' or str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/Backflow/Plumbing%20Contractors.pdf':
                def __extractPdf(self, response):
                    """Group the multi-line backflow PDF into per-company records.

                    NOTE(review): called below as ``__extractPdf(file, response)``
                    so ``self`` actually receives the temp-file path; the body
                    reads the enclosing ``file`` via closure instead, and the
                    double-underscore name is mangled consistently at the def
                    and the call site, so the call resolves — fragile but works.
                    """
                    df = tabula.read_pdf(file, pages='all', encoding='ISO-8859-1', guess=False, area=[88.358, 22.95, 738.608, 597.465], columns=[154.53, 285.345, 415.925, 543.15], pandas_options={'header': None})
                    asd = [df[i] for i in df.columns.values]
                    result = pd.concat(asd).reset_index(drop=True)
                    df = result.to_frame(name=None)
                    # A cell beginning with '(' is a phone number; use it as the
                    # record separator.
                    df[1] = df.apply(lambda x: x[0] if str(x[0]).startswith('(') else np.nan, axis=1)

                    def fillUniqueNum(v):
                        # Assign a running record number; bump after each phone cell.
                        if fillUniqueNum.change:
                            fillUniqueNum.unique_num += 1
                            fillUniqueNum.change = False
                        if str(v[0]).startswith('('):
                            fillUniqueNum.change = True
                        return str(fillUniqueNum.unique_num)
                    fillUniqueNum.change = False
                    fillUniqueNum.unique_num = 1
                    df[2] = df.apply(lambda v: fillUniqueNum(v), axis=1)
                    df = df[[0, 1, 2]]
                    df = df.groupby(2)
                    for val, i in enumerate(df):
                        x = pd.DataFrame(i[1]).reset_index(drop=True)
                        x = x.drop(columns=2)
                        if x.apply(len).values[0] > 2:
                            x = x.dropna(how='all')
                            try:
                                x[0] = x.apply(lambda x: x[0] if not str(x[0]).startswith('(') else np.nan, axis=1)
                                x = x.apply(lambda x: pd.Series(x.dropna().values))
                                x[2] = x[0][1:]
                                x = x.apply(lambda x: pd.Series(x.dropna().values))
                                # All lines between name and phone form the address.
                                x[3] = ', '.join(x[2].tolist()[:-1])
                                x = x.drop(columns=2)
                                x = x.dropna()
                                x.columns = ['company_name', 'phone', 'loc1']
                                final_df = x.to_dict('records')
                                yield final_df
                            except ValueError:
                                # Malformed group (column count mismatch): skip it.
                                pass
                for col in __extractPdf(file, response):
                    for row in col:
                        company_name = row['company_name']
                        company_phone = row['phone']
                        location_address_string = row['loc1']
                        meta['permit_subtype'] = 'BACKFLOW TESTING'
                        meta['company_name'] = self._getDBA(company_name)[0]
                        meta['dba_name'] = self._getDBA(company_name)[1]
                        meta['permit_type'] = 'contractor_license'
                        meta['company_phone'] = company_phone
                        meta['location_address_string'] = location_address_string
                        if str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/Backflow/Lawn%20Irrigation%20Contractors.pdf':
                            meta['permit_lic_desc'] = 'BACKFLOW TESTING-Lawn Irrigation Contractors '
                            meta['type/licensed contractors'] = 'Lawn Irrigation Contractors'
                            yield self.save_to_csv(response, **meta)
                        elif str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/Backflow/Plumbing%20Contractors.pdf':
                            meta['permit_lic_desc'] = 'BACKFLOW TESTING-Plumbing Contractors'
                            meta['type/licensed contractors'] = 'Plumbing Contractors'
                            yield self.save_to_csv(response, **meta)
            elif str(url) == 'https://www.stlouisco.com/Portals/8/docs/document%20library/public%20works/code%20enforcement/licenses/backflow/Fire%20Suppression%20Contractors.pdf':
                df = tabula.read_pdf(file, pages='all', encoding='ISO-8859-1', guess=False, area=[88.358, 22.95, 738.608, 597.465], columns=[154.53, 285.345, 416.925, 543.15], pandas_options={'header': None})
                asd = [df[i] for i in df.columns.values]
                result = pd.concat(asd).reset_index(drop=True)
                df = result.to_frame(name=None)
                df[1] = df.apply(lambda x: x[0] if str(x[0]).startswith('(') else np.nan, axis=1)
                # Phone number appears on the line after the company name.
                df[1] = df[1].shift(-1)
                df[0] = df.apply(lambda x: x[0] if not str(x[0]).startswith('(') else np.nan, axis=1)
                df = df.dropna(how='all')
                for _, row in df.fillna('').iterrows():
                    row = row.tolist()
                    meta['permit_subtype'] = 'BACKFLOW TESTING'
                    company_name = row[0]
                    meta['company_name'] = self._getDBA(company_name)[0]
                    meta['dba_name'] = self._getDBA(company_name)[1]
                    meta['company_phone'] = row[1]
                    # This list carries no street addresses; state only.
                    meta['location_address_string'] = 'MO'
                    meta['permit_lic_desc'] = 'BACKFLOW TESTING-Fire Suppression Contractors'
                    meta['permit_type'] = 'contractor_license'
                    meta['type/licensed contractors'] = 'Fire Suppression Contractors'
                    yield self.save_to_csv(response, **meta)
            elif str(url) == 'https://www.stlouisco.com/Portals/8/docs/Document%20Library/Public%20Works/code%20enforcement/licenses/Backflow/Process%20Piping%20Contractors.pdf':
                df = tabula.read_pdf(file, pages='all', encoding='ISO-8859-1', guess=False, area=[88.358, 22.95, 738.608, 597.465], columns=[154.53, 285.345, 416.925, 543.15], pandas_options={'header': None})
                asd = [df[i] for i in df.columns.values]
                result = pd.concat(asd).reset_index(drop=True)
                df = result.to_frame(name=None)
                df[1] = df.apply(lambda x: x[0] if str(x[0]).startswith('(') else np.nan, axis=1)
                df[1] = df[1].shift(-1)
                df[0] = df.apply(lambda x: x[0] if not str(x[0]).startswith('(') else np.nan, axis=1)
                df = df.dropna(how='all')
                for _, row in df.fillna('').iterrows():
                    row = row.tolist()
                    meta['permit_subtype'] = 'BACKFLOW TESTING'
                    company_name = row[0]
                    meta['company_name'] = self._getDBA(company_name)[0]
                    meta['dba_name'] = self._getDBA(company_name)[1]
                    meta['company_phone'] = row[1]
                    meta['location_address_string'] = 'MO'
                    meta['permit_lic_desc'] = 'BACKFLOW TESTING-Process Piping Contractors'
                    meta['permit_type'] = 'contractor_license'
                    meta['type/licensed contractors'] = 'Process Piping Contractors'
                    yield self.save_to_csv(response, **meta)

    def save_to_csv(self, response, **meta):
        """Load one licensee record into the export item and return it.

        NOTE(review): unlike sibling spiders, no 'ingestion_timestamp' value
        is added here even though it is listed in FIELDS_TO_EXPORT — confirm
        whether the pipeline fills it elsewhere.
        """
        il = ItemLoader(item=MoStlouisContractorLicensesSpiderItem())
        il.add_value('sourceName', 'MO_StLouis_Contractor_Licenses')
        il.add_value('url', 'https://www.stlouisco.com/YourGovernment/PublicWorks/Licensing')
        il.add_value('permit_subtype', meta['permit_subtype'])
        il.add_value('mail_address_string', meta['mail_address_string'])
        il.add_value('dba_name', meta['dba_name'])
        il.add_value('company_name', meta['company_name'])
        il.add_value('type/licensed contractors', meta['type/licensed contractors'])
        il.add_value('person_subtype', meta['person_subtype'])
        il.add_value('person_name', meta['person_name'])
        il.add_value('permit_type', meta['permit_type'])
        il.add_value('permit_lic_desc', meta['permit_lic_desc'])
        il.add_value('company_phone', meta['company_phone'])
        il.add_value('location_address_string', meta['location_address_string'])
        return il.load_item()

    def storeGet_tempfile(self, response):
        """Write the response body to a temp file and return its path."""
        outfd, temp_path = tempfile.mkstemp(prefix='', suffix='')
        with os.fdopen(outfd, 'wb') as pdf_file:
            pdf_file.write(response.body)
        return temp_path
class IlDupageFoodInspectionsSpider(CommonSpider):
    """Scrape DuPage County (IL) food-facility inspections.

    Walks the Press Agent JSON API: facilities -> programs -> inspections,
    emitting one record per violation (or one empty record per inspection /
    facility when nothing deeper exists).
    """
    name = '1372_il_dupage_food_inspections'
    allowed_domains = ['dupagehealth.org']
    start_urls = ['https://eco.dupagehealth.org/#/pa1/search']
    custom_settings = {
        'FILE_NAME': Utils.getRundateFileName('AI-1372_Inspections_Food_IL_Dupage_CurationReady'),
        'JIRA_ID': 'AI_1372',
        'COOKIES_ENABLED': True,
        'DOWNLOAD_DELAY': 0.5,
        # 'TRACKING_OPTIONAL_PARAMS':['company_name'],
        'COOKIES_DEBUG': True,
        'HTTPCACHE_ENABLED': False,
        # BUG FIX: was 'CONCURRENT REQUESTS' (space instead of underscore) —
        # Scrapy matches setting names exactly, so the throttle was ignored.
        'CONCURRENT_REQUESTS': 1,
        # 'JOBDIR' : CustomSettings.getJobDirectory('IlDupageFoodInspectionsSpider'),
        'TOP_HEADER': {
            'abate_date': '',
            'abate_status': '',
            'company_name': 'Facility Name',
            'dba_name': '',
            'inspection_date': '',
            'inspection_subtype': 'Inspection Type',
            'inspection_type': '',
            'location_address_string': 'Address',
            'violation_date': '',
            'violation_description': '',
            'violation_rule': '',
            'violation_type': ''
        },
        'FIELDS_TO_EXPORT': [
            'company_name', 'dba_name', 'location_address_string',
            'inspection_subtype', 'inspection_date', 'inspection_type',
            'violation_date', 'violation_rule', 'violation_description',
            'abate_date', 'abate_status', 'violation_type', 'url',
            'sourceName', 'ingestion_timestamp'
        ],
        'NULL_HEADERS': []
    }

    def parse(self, response):
        """Query the facility-search API with a '%' wildcard to list everything."""
        # print('---------------------',response.text)
        form_Data = {"FacilityName": '%'}
        yield scrapy.FormRequest(
            url="https://eco.dupagehealth.org/api/pressAgentClient/searchFacilities?PressAgentOid=168b2416-e0e5-4907-b866-a7b400f66c46",
            dont_filter=True,
            method='POST',
            formdata=form_Data,
            callback=self.parse_dtl)

    @inline_requests
    def parse_dtl(self, response):
        """For every facility, fetch its programs and inspections inline and emit rows.

        Uses @inline_requests so nested API calls can be awaited with
        ``yield scrapy.Request(...)`` and their responses used immediately.
        """
        meta = {}
        jsonresponse = json.loads(response.body_as_unicode())
        for i in jsonresponse:
            company_name = i['FacilityName']
            Address = i['Address']
            # API returns "City IL Zip"; insert the comma for a parseable address.
            address1 = (i['CityStateZip']).replace(' IL ', ', IL ')
            location_address_string = str(Address) + ', ' + str(address1)
            ids = i['FacilityId']
            meta = {
                'company_name': company_name,
                'location_address_string': location_address_string,
                'inspection_subtype': '',
                'inspection_date': '',
                'violation_date': '',
                'violation_rule': '',
                'violation_description': '',
                'abate_date': '',
                'abate_status': '',
                'violation_type': '',
                'inspection_type': ''
            }
            linkjoin = 'https://eco.dupagehealth.org/api/pressAgentClient/programs?FacilityId=' + str(ids) + '&PressAgentOid=168b2416-e0e5-4907-b866-a7b400f66c46'
            parse_get = yield scrapy.Request(url=linkjoin, dont_filter=True)
            jsonresponse1 = json.loads(parse_get.body_as_unicode())
            for j in jsonresponse1:
                programs_id = j['ProgramId']
                insjoin = 'https://eco.dupagehealth.org/api/pressAgentClient/inspections?PressAgentOid=168b2416-e0e5-4907-b866-a7b400f66c46&ProgramId=' + str(programs_id)
                parse_ins = yield scrapy.Request(url=insjoin, dont_filter=True)
                ins_jsonresponse = json.loads(parse_ins.body_as_unicode())
                if ins_jsonresponse and len(ins_jsonresponse) > 0:
                    for k in ins_jsonresponse:
                        meta['inspection_subtype'] = k['service']
                        meta['inspection_date'] = self.format_date((k['activity_date']).replace('T00:00:00', ''))
                        meta['inspection_type'] = 'health_inspection'
                        violation = k['violations']
                        print('---------------', meta['inspection_subtype'], meta['inspection_date'])
                        if violation:
                            meta['abate_status'] = meta['abate_date'] = ''
                            # One output row per violation on the inspection.
                            for m in violation:
                                meta['violation_rule'] = m['violation_description']
                                meta['violation_description'] = m['v_memo']
                                meta['violation_type'] = 'health_violation'
                                meta['violation_date'] = meta['inspection_date']
                                if meta['violation_description']:
                                    # "Corrected on site" markers imply abatement
                                    # on the inspection date itself.
                                    if ' COS' in meta['violation_description'] or 'Corrected on-site' in meta['violation_description'] or '(COS)' in meta['violation_description']:
                                        meta['abate_status'] = 'COS'
                                        meta['abate_date'] = meta['inspection_date']
                                    else:
                                        meta['abate_status'] = ''
                                        meta['abate_date'] = ''
                                yield self.save_to_csv(parse_ins, **meta)
                        else:
                            # Inspection with no violations: emit one blank-violation row.
                            meta['violation_description'] = meta['violation_rule'] = meta['violation_type'] = meta['violation_date'] = meta['abate_status'] = meta['abate_date'] = ''
                            yield self.save_to_csv(parse_ins, **meta)
                else:
                    # Program with no inspections: emit the bare facility record.
                    meta['inspection_type'] = meta['inspection_date'] = meta['inspection_subtype'] = meta['violation_description'] = meta['violation_rule'] = meta['violation_type'] = meta['violation_date'] = meta['abate_status'] = meta['abate_date'] = ''
                    yield self.save_to_csv(parse_ins, **meta)

    def save_to_csv(self, response, **meta):
        """Load one inspection/violation record into the export item and return it."""
        il = ItemLoader(item=IlDupageFoodInspectionsSpiderItem(), response=response)
        il.default_input_processor = MapCompose(lambda v: v.strip(), remove_tags, replace_escape_chars)
        #il.add_value('ingestion_timestamp', Utils.getingestion_timestamp())
        il.add_value('sourceName', 'IL_Dupage_Food_Inspections')
        il.add_value('url', 'https://eco.dupagehealth.org/#/pa1/search')
        # Fall back to the bare state when the facility had no address.
        il.add_value('location_address_string', meta['location_address_string'] if meta['location_address_string'] else 'IL')
        il.add_value('abate_date', meta['abate_date'])
        il.add_value('inspection_date', meta['inspection_date'])
        il.add_value('company_name', self._getDBA(meta['company_name'])[0] if meta['company_name'] else '')
        il.add_value('violation_type', meta['violation_type'])
        il.add_value('violation_description', meta['violation_description'])
        il.add_value('dba_name', self._getDBA(meta['company_name'])[1] if meta['company_name'] else '')
        il.add_value('inspection_type', meta['inspection_type'])
        il.add_value('violation_date', meta['violation_date'])
        il.add_value('abate_status', meta['abate_status'])
        il.add_value('inspection_subtype', meta['inspection_subtype'])
        il.add_value('violation_rule', meta['violation_rule'])
        return il.load_item()