def parse(self, response):
    """Build the search-key queue and request the first search result.

    The keys come from ``SearchCriteria.strRange`` when both ``self.start``
    and ``self.end`` are alphabetic, otherwise from
    ``SearchCriteria.numberRange``.  The first key is popped and requested;
    the remaining keys stay in ``self.searchkeys``.

    Yields:
        scrapy.Request: the search request, handled by ``self.parse_data``
        with the popped key in ``meta['search']``.  Nothing is yielded when
        the range produced no keys.
    """
    # Reset before any request is issued (the original did this after the
    # yield, which still runs before the callback can fire).
    self.ids = []

    if self.start.isalpha() and self.end.isalpha():
        self.searchkeys = SearchCriteria.strRange(self.start, self.end)
        # NOTE(review): these three keys are deliberately skipped; the
        # reason is not visible here -- presumably the endpoint rejects them.
        for skipped in ('AND', 'NOT', 'BTI'):
            if skipped in self.searchkeys:
                self.searchkeys.remove(skipped)
    else:
        self.searchkeys = SearchCriteria.numberRange(
            self.start, self.end, 1)

    print(len(self.searchkeys))

    # An empty range previously crashed with IndexError on pop(0).
    if not self.searchkeys:
        return

    search = self.searchkeys.pop(0)
    # Current time in milliseconds, sent as the `_` query parameter.
    timestamp_ms = str(int(round(time.time() * 1000)))
    url = 'https://businesssearchapi.sos.state.oh.us/zyjLcCmoqeZffOn1ajJdsiek3tmuj9QtZVn{}_X?_={}'.format(
        str(search), timestamp_ms)
    yield scrapy.Request(url=url,
                         callback=self.parse_data,
                         dont_filter=True,
                         meta={'search': search})
def parse(self, response):
    """Submit the next business-name search, including the solved captcha.

    On the first call (``self.check_first`` is true) the key queue
    ``self.searchkeys`` is built from ``self.start``/``self.end``.  Each call
    then pops one key and posts the search form; nothing is yielded once the
    queue is empty.

    Yields:
        scrapy.FormRequest: POST to the BusinessSearch endpoint, handled by
        ``self.parse_next`` with ``meta['page'] == 2``.
    """
    # The iframe's src is treated as the captcha image path on this host.
    iframe_src = response.xpath('//iframe/@src').extract_first()
    image_url = 'https://quickstart.sos.nh.gov' + iframe_src
    token = response.xpath(
        '*//input[@name="__RequestVerificationToken"]/@value'
    ).extract_first()

    if self.check_first:
        self.searchkeys = []
        both_alpha = self.start.isalpha() and self.end.isalpha()
        if both_alpha:
            self.searchkeys = SearchCriteria.strRange(self.start, self.end)
        else:
            self.searchkeys = SearchCriteria.numberRange(
                self.start, self.end, 1)
        self.check_first = False

    if not self.searchkeys:
        return

    search = self.searchkeys.pop(0)
    business_name = str(search)
    captcha_text = self.getcaptchaCoder().resolveImgCaptcha(image_url)
    formdata = {
        '__RequestVerificationToken': token,
        'rbBusinessNameSearch': 'StartsWith',
        'rbBasicSearch': 'BusinessName',
        'txtBusinessName': business_name,
        'chkWithSimilarSoundingBusinessNames': 'false',
        'txtHomeStateBusiness': '',
        'chkWithSoundingBusinessHomeStateNames': 'false',
        'txtBusinessID': '',
        'txtFilingNumber': '',
        'txtBusCreationDate': '',
        'ddlBusinessType': '',
        'ddlBusinessStatus': '',
        'txtCity': '',
        'txtZipCode': '',
        'County': '',
        'txtAgentName': '',
        'chkWithSimilarSoundingAgentNames': 'false',
        'txtPrincipalName': '',
        'chkWithSimilarSoundingPrincipalNames': 'false',
        'txtCaptcha': captcha_text,
        'btnSearch': 'Search',
        'hdnMessage': '',
    }
    yield scrapy.FormRequest(
        url='https://quickstart.sos.nh.gov/online/BusinessInquire/BusinessSearch',
        callback=self.parse_next,
        method='POST',
        formdata=formdata,
        dont_filter=True,
        meta={'page': 2})
def parse(self, response):
    """Request the permit count for the next owner search term.

    The term queue ``self.search_element`` is filled from
    ``SearchCriteria.strRange`` on the first call (while ``self.value`` is
    true).  Each call pops one term; nothing is yielded once the queue is
    empty.

    Yields:
        scrapy.Request: GET on the Search/Count API, handled by
        ``self.parse_user_det`` with the term in ``meta['parm']``.
    """
    if self.value:
        self.search_element = SearchCriteria.strRange(self.start, self.end)
        self.value = False

    if not self.search_element:
        return

    parm = self.search_element.pop(0)
    url_head = 'https://public.claycountygov.com/permitsearch/API/Search/Count?tab=Owner&sortfield=issuedate&sortdirection=D&owner='
    url_tail = '&status=all&page=1&v=496'
    count_url = url_head + str(parm) + url_tail

    request_headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,ta;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        # DNT: 1
        'Host': 'public.claycountygov.com',
        'Referer': 'https://public.claycountygov.com/PermitSearch/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    }
    yield scrapy.Request(url=count_url,
                         callback=self.parse_user_det,
                         headers=request_headers,
                         dont_filter=True,
                         meta={"parm": parm})
def parse(self, response):
    """Post one address search per (search key, digit 0-9) combination.

    The search page is requested first and a hidden input value is read from
    it; that value prefixes the ``datachanges`` payload of every search.

    NOTE(review): the result of the first ``yield`` is used as a response
    object.  Plain Scrapy does not send responses back into callbacks, so
    this presumably relies on a decorator (e.g. inline requests) applied
    outside this view -- confirm.

    Yields:
        scrapy.Request: the initial page request, then one
        ``scrapy.FormRequest`` per combination, handled by ``self.parse2``.
    """
    search_url = "https://webpermit.mecklenburgcountync.gov/Default.aspx?PossePresentation=SearchByAddress"

    main_page = yield scrapy.Request(url=search_url, dont_filter=True)
    hidden_values = main_page.xpath(
        "/html/body/div/form[1]/input[5]/@value").extract()
    datac = hidden_values[0]

    self.search_businames = SearchCriteria.strRange(self.start, self.end)
    for alpha in self.search_businames:
        for digit in range(10):
            changes = (
                ('C', 'S0', 447713, str(digit)),
                ('C', 'S0', 447712, str(alpha)),
                ('F', '', 0, 0),
            )
            # Tuple repr with the outer parentheses and all spaces removed.
            serialized = str(changes).replace('((', '(').replace(
                '))', ')').replace(' ', '')
            payload = datac + ',' + serialized
            fields = {
                'currentpaneid': '467087',
                'paneid': '467088',
                'functiondef': '5',
                'sortcolumns': '{}',
                'datachanges': str(payload),
                'comesfrom': 'posse',
                'changesxml': '',
            }
            yield scrapy.FormRequest(url=search_url,
                                     formdata=fields,
                                     dont_filter=True,
                                     callback=self.parse2)
def parse_things(self, response):
    """Post the permit search form for the next last-name search term.

    The term queue ``self.search_element`` is filled from
    ``SearchCriteria.strRange`` on the first call (while ``self.value`` is
    true).  Each call pops one term and posts it together with the
    ``__VIEWSTATE`` and ``__EVENTVALIDATION`` values read from ``response``.

    Yields:
        scrapy.FormRequest: the search post, handled by ``self.parse_second``.
        Nothing is yielded once the queue is empty.
    """
    if self.value:
        self.search_element = SearchCriteria.strRange(self.start, self.end)
        self.value = False

    if not self.search_element:
        return

    val = self.search_element.pop(0)
    view_state = response.xpath(
        "//input[@id='__VIEWSTATE']/@value").extract_first()
    event_validation = response.xpath(
        "//input[@id='__EVENTVALIDATION']/@value").extract_first()

    payload = {
        'ctl00$cphMainApp$ToolkitScriptManager1': 'ctl00$cphMainApp$upSearch|ctl00$cphMainApp$ButtonPermitSearch',
        '__VIEWSTATEGENERATOR': 'DCE30F85',
        '__VIEWSTATE': view_state,
        '__EVENTVALIDATION': event_validation,
        'ctl00$cphMainApp$PermitSearchCriteria1$TextBoxRefLastName': str(val),
        '__ASYNCPOST': 'true',
        'ctl00$cphMainApp$ButtonPermitSearch': 'Search For Permits',
    }
    yield scrapy.FormRequest(
        url='https://gcs.douglascountywi.org/gcswebportal/search.aspx',
        callback=self.parse_second,
        dont_filter=True,
        formdata=payload)
def verify_captcha(self, response):
    """Post the individual-license grid query for the next last-name term.

    ``response`` is parsed as JSON and its ``'d'`` value is sent as ``vid``.
    The term queue ``self.search_element`` is refilled from
    ``SearchCriteria.strRange`` whenever it is empty on entry.

    Yields:
        scrapy.Request: JSON POST to ``GetIndv_license`` (resolved against
        ``response``), handled by ``self.getIndv_license`` with ``page``,
        ``vid`` and ``param`` in ``meta``.
    """
    # NOTE(review): an empty queue is refilled on every call, so the range
    # restarts once it is exhausted -- confirm that is intended.
    if len(self.search_element) == 0:
        self.search_element = SearchCriteria.strRange(self.start, self.end)

    if len(self.search_element) == 0:
        return

    param = self.search_element.pop(0)
    print('--------------------param---------', param)

    # `response.text` replaces the deprecated `body_as_unicode()`.
    jsonresponse = json.loads(response.text)
    vid = jsonresponse['d']
    formdata = {
        'county': '-1',
        'fname': '',
        'lictype': '-1',
        'lname': str(param),
        'lnumber': '',
        'page': '1',
        'pageSize': '20',
        'sdata': [],
        'sortby': '',
        'sortexp': '',
        'vid': vid,
    }
    yield scrapy.Request(
        response.urljoin('/online/JS_grd/Grid.svc/GetIndv_license'),
        method='POST',
        body=json.dumps(formdata),
        headers={'Content-Type': 'application/json'},
        callback=self.getIndv_license,
        meta={
            'page': '2',
            'vid': vid,
            'param': param
        })
def parse(self, response):
    """Start the search for the next last name in ``self.searchkeys``.

    Resets ``self.page_no`` to 1, pops the next key into ``self.last_name``
    and rebuilds ``self.searchkeys1`` as the ``'aa'``..``'zz'`` range.

    Yields:
        scrapy.Request: request for ``self.start_urls[0]``, handled by
        ``self.parse_search``.  Nothing is yielded when no keys remain.
    """
    if not self.searchkeys:
        return

    self.page_no = 1
    self.last_name = self.searchkeys.pop(0)
    self.searchkeys1 = SearchCriteria.strRange('aa', 'zz')

    first_url = self.start_urls[0]
    yield scrapy.Request(url=first_url,
                         callback=self.parse_search,
                         dont_filter=True)
def __init__(self,
             start=None,
             end=None,
             startnum=None,
             endnum=None,
             is_server=None,
             proxyserver=None,
             *a,
             **kw):
    """Initialise the spider and build its search-key queue.

    Args:
        start, end: bounds passed to ``SearchCriteria.strRange``.
        startnum, endnum: bounds passed to ``SearchCriteria.numberRange``.
        is_server: accepted but not used in this method.
        proxyserver: accepted but not forwarded (see note below).
        *a, **kw: passed through to the base initialiser.

    ``self.searchkeys`` holds the numeric keys followed by the string keys
    when both ranges are given, otherwise whichever range was supplied.
    """
    # NOTE(review): the base class is always given proxyserver=None, so the
    # caller's `proxyserver` is ignored -- confirm whether that is intended.
    super(CtTeachersLicensesSpider, self).__init__(start,
                                                   end,
                                                   proxyserver=None,
                                                   *a,
                                                   **kw)
    if start and startnum:
        self.searchkeys = SearchCriteria.numberRange(
            startnum, endnum, 1) + SearchCriteria.strRange(start, end)
    elif start:
        self.searchkeys = SearchCriteria.strRange(start, end)
    elif startnum:
        self.searchkeys = SearchCriteria.numberRange(startnum, endnum, 1)
    elif not hasattr(self, 'searchkeys'):
        # No range supplied: previously the attribute was left undefined,
        # so any later access raised AttributeError.  Keep a value the base
        # class may already have set; otherwise default to an empty queue.
        self.searchkeys = []
def parse(self, response):
    """Post the permit search for the next date in the range.

    On the first call the date queue is built with
    ``SearchCriteria.dateRange`` (one-day steps, ``%m/%d/%Y``) and its first
    entry is stored in ``self.end_date``.  Each call searches for the
    previous ``self.end_date`` and then advances it to the next entry.

    NOTE(review): because a date is only searched once a successor has been
    popped, the last entry of the range is never searched itself -- confirm
    against what ``dateRange`` returns.

    Yields:
        scrapy.FormRequest: POST to ``response.url``, handled by
        ``self.parse_details`` with ``page`` and ``start_date`` in ``meta``.
        Nothing is yielded once the queue is empty.
    """
    if self.check_first:
        self.check_first = False
        self.search_element = SearchCriteria.dateRange(
            self.start, self.end, freq='1D', formatter='%m/%d/%Y')
        # An empty range previously crashed here with IndexError.
        if len(self.search_element) > 0:
            self.end_date = self.search_element.pop(0)

    if len(self.search_element) == 0:
        return

    start_date = copy.copy(self.end_date)
    self.end_date = self.search_element.pop(0)
    formdata = {
        'ctl00$RadScriptManager1': 'ctl00$RadScriptManager1|ctl00$cplMain$btnSearch',
        '__EVENTTARGET': ' ctl00$cplMain$btnSearch',
        '__EVENTARGUMENT': '',
        '__VIEWSTATE': response.xpath(
            '//*[@id="__VIEWSTATE"]/@value').extract_first(),
        '__VIEWSTATEGENERATOR': '2A136539',
        'ctl00$ucLogin$hfDashboardRedirect': 'https://etrakit.champaignil.gov/etrakit/dashboard.aspx',
        'ctl00$ucLogin$hfCartRedirect': 'https://etrakit.champaignil.gov/etrakit/ShoppingCart.aspx?iscartview=true',
        'ctl00$ucLogin$hfHome': ' https://etrakit.champaignil.gov/etrakit/default.aspx',
        'ctl00_ucLogin_RadTextBox2_ClientState': '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}',
        'ctl00_ucLogin_txtPassword_ClientState': '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}',
        'ctl00$cplMain$ddSearchBy': 'Permit_Main.ISSUED',
        'ctl00$cplMain$ddSearchOper': 'EQUALS',
        'ctl00$cplMain$txtSearchString': str(start_date),
        'ctl00_cplMain_tcSearchDetails_ClientState': '{"selectedIndexes":["2"],"logEntries":[],"scrollState":{}}',
        '__ASYNCPOST': 'true',
        'RadAJAXControlID': 'ctl00_RadAjaxManager1',
    }
    yield scrapy.FormRequest(url=response.url,
                             formdata=formdata,
                             method='POST',
                             dont_filter=True,
                             callback=self.parse_details,
                             meta={
                                 'page': 1,
                                 'start_date': str(start_date)
                             })
def parse_next(self, response):
    """Queue (status, search term) pairs and post the next entity search.

    ``self.final_list`` is a flat queue in which each entity-status option
    value is immediately followed by the search term it is paired with.  It
    is rebuilt whenever it is empty on entry; each call then pops one pair
    and submits it.

    Yields:
        scrapy.FormRequest: POST to ``Home.aspx/ProcessRequest``, handled by
        ``self.parse_for_listpage`` (errors go to ``self.handle_form_error``).
    """
    # Choose how the search terms are produced from the form of self.start.
    if self.start.isdigit():
        self.search_elements = SearchCriteria.numberRange(
            self.start, self.end)
    elif self.start.isalpha():
        self.search_elements = SearchCriteria.strRange(
            self.start, self.end)
    elif self.start.isalnum():
        # Mixed letters and digits: append every alpha_list entry combined
        # with each digit 0-9.
        # NOTE(review): this appends to the existing list on every call, so
        # repeated calls accumulate duplicates -- confirm.
        for s in self.alpha_list:
            for i in range(0, 10):
                search = s + str(i)
                self.search_elements.append(search)
    # All status option values except the first one.
    drop_down = response.xpath(
        '//*[@id="BizEntitySearch_EntityStatus"]/option/@value').extract(
        )[1:]
    if len(self.final_list) == 0:
        # Flatten every (status, term) combination as status, term, ...
        for search in self.search_elements:
            for drop in drop_down:
                self.final_list.append(drop)
                self.final_list.append(search)
    # NOTE(review): the original indentation was lost; confirm whether this
    # debug print belongs inside the `if` block above.
    print('______________final_list:', self.final_list)
    if len(self.final_list) > 0:
        # Entries were appended as (status, term), so pop in that order.
        val = self.final_list.pop(0)
        search_ = self.final_list.pop(0)
        form_data3 = {
            "BizEntitySearch_String": str(search_),
            "Search": "Search",
            "BizEntitySearch_Type": "EntityName",
            "BizEntitySearch_DepthType": "StartsWith",
            "BizEntitySearch_EntityStatus": str(val),
            "BizEntitySearch_TradeNameStatus": ""
        }
        yield scrapy.FormRequest(
            url="https://corponline.dcra.dc.gov/Home.aspx/ProcessRequest",
            callback=self.parse_for_listpage,
            formdata=form_data3,
            method='POST',
            dont_filter=True,
            errback=self.handle_form_error)
def __init__(self,
             start=None,
             end=None,
             startnum=None,
             endnum=None,
             proxyserver=None,
             *a,
             **kw):
    """Initialise the spider and precompute its search-term list.

    ``start`` and ``end`` are forwarded to the base initialiser and also
    used to build ``self.search_element`` via ``SearchCriteria.strRange``.
    ``startnum``, ``endnum`` and ``proxyserver`` are accepted but not used
    here; the base initialiser is always called with ``proxyserver=None``.
    """
    base_init = super(AlMedicalPersonLicensesSpider, self).__init__
    base_init(start, end, proxyserver=None, *a, **kw)

    search_terms = SearchCriteria.strRange(start, end)
    self.search_element = search_terms
def __init__(self,
             start=None,
             end=None,
             startnum=None,
             endnum=None,
             proxyserver=None,
             *a,
             **kw):
    """Initialise the spider and build ``self.year``.

    ``start`` and ``end`` are forwarded to the base initialiser, which is
    always called with ``proxyserver=None``.  ``self.year`` is then built by
    ``SearchCriteria.strRange`` from ``self.start`` and ``self.end`` as they
    stand after the base initialiser has run.
    """
    base_init = super(WyPhysicianLicensesSpider, self).__init__
    base_init(start, end, proxyserver=None, *a, **kw)

    lower, upper = self.start, self.end
    self.year = SearchCriteria.strRange(lower, upper)
def parse(self, response):
    """Build the (query, entity type) queue and post the first search.

    Every key from the configured range is combined with each entity type;
    the combinations are stored in ``self.search_lis`` as dicts with
    ``'query'`` and ``'entityType'`` keys.  The first one is popped and
    submitted.

    Yields:
        scrapy.FormRequest: POST to the documents search page, handled by
        ``self.parse_data`` (errors go to ``self.err_parse_data``) with the
        popped dict in ``meta['search']``.  Nothing is yielded when the
        range produced no keys.
    """
    entity_types = [
        'COOPERATIVE', 'CORPORATION', 'LLC', 'PARTNERSHIP', 'TRADENAME'
    ]
    req_url = 'https://hbe.ehawaii.gov/documents/search.html'

    # Reset before any request is issued (the original did this after the
    # yield, which still runs before the callback can fire).
    self.links = []

    if self.start.isalpha() and self.end.isalpha():
        searchkeys = SearchCriteria.strRange(self.start, self.end)
    else:
        searchkeys = SearchCriteria.numberRange(self.start, self.end, 1)

    # Key-major order: all entity types for one key, then the next key.
    self.search_lis = [{
        'query': key,
        'entityType': entity_type
    } for key in searchkeys for entity_type in entity_types]

    # An empty range previously crashed with IndexError on pop(0).
    if not self.search_lis:
        return

    search = self.search_lis.pop(0)
    formdata = {
        'beginsWith': 'true',
        'query': search['query'],
        'recordType': 'ALL',
        'status': 'ALL',
        'entityType': search['entityType'],
        'page': '0'
    }
    yield scrapy.FormRequest(url=req_url,
                             formdata=formdata,
                             callback=self.parse_data,
                             dont_filter=True,
                             meta={'search': search},
                             errback=self.err_parse_data)
def on_search(self, response):
    """Submit the search form once for every term from AAAA to ZZZZ.

    Each request carries the search term plus the ``__VIEWSTATE``,
    ``__VIEWSTATEGENERATOR`` and ``__EVENTVALIDATION`` values read from
    ``response``.

    Yields:
        scrapy.FormRequest: built with ``FormRequest.from_response`` against
        ``self.form_url`` and handled by ``self.second_page_crawl``.
    """
    # NOTE(review): the original bound this result to `form_data` and then
    # overwrote it with the loop variable, so the value was never used.  The
    # call is kept in case it has side effects -- confirm and remove if not.
    self.form_data(response)

    # The hidden fields depend only on `response`; read them once instead
    # of on every iteration.
    view_state = response.xpath(
        "//input[@id='__VIEWSTATE']/@value").extract_first()
    view_state_generator = response.xpath(
        "//input[@id='__VIEWSTATEGENERATOR']/@value").extract_first()
    event_validation = response.xpath(
        "//input[@id='__EVENTVALIDATION']/@value").extract_first()

    # Iterate the range directly rather than building every dict up front.
    for key in SearchCriteria.rangeAAAAtoZZZZ():
        form_data = {
            "ctl00$MainContent$txtSearchTerms": key,
            "__VIEWSTATE": view_state,
            "__VIEWSTATEGENERATOR": view_state_generator,
            "__EVENTVALIDATION": event_validation,
        }
        yield scrapy.FormRequest.from_response(
            response,
            url=self.form_url,
            callback=self.second_page_crawl,
            formdata=form_data)
def parse(self, response):
    """Submit the verification search for the next shop-name term.

    On the first call the shop-name queue ``self.search_element_a`` is built
    from ``self.starta``/``self.enda`` and the last-name term
    ``self.search_element1`` is taken from the front of
    ``self.search_element``.  Both terms are sent prefixed with ``'*'``.

    Yields:
        scrapy.FormRequest: built from the ``aspnetForm`` form in
        ``response`` and handled by ``self.parse_details``.  Nothing is
        yielded once the shop-name queue is empty.
    """
    if self.check_first:
        self.check_first = False
        self.search_element_a = SearchCriteria.strRange(
            self.starta, self.enda)
        self.search_element1 = '*' + str(self.search_element.pop(0))

    if not self.search_element_a:
        return

    shop_term = '*' + str(self.search_element_a.pop(0))
    fields = {
        'ctl00$ContentPlaceHolder1$txtLastName': str(self.search_element1),
        'ctl00$ContentPlaceHolder1$txtShopName': str(shop_term),
        '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$btnLicenseeSubmit',
    }
    request_headers = {
        'Connection': 'keep-alive',
        'Host': 'alboc.glsuite.us',
        'Origin': 'https://alboc.glsuite.us',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application signed-exchange;v=b3',
        'Referer': 'https://alboc.glsuite.us/GLSuiteWeb/Clients/ALBOC/public/VerificationSearch.aspx',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    yield scrapy.FormRequest.from_response(response,
                                           headers=request_headers,
                                           formdata=fields,
                                           formid='aspnetForm',
                                           method="POST",
                                           dont_filter=True,
                                           callback=self.parse_details)
def search(self, response):
    """Post the permit-number search for the next number in the range.

    On the first call the queue ``self.search_element`` is built with
    ``SearchCriteria.numberRange`` from ``int(self.start)`` to
    ``int(self.end)``.  Each call pops one number, left-pads it with zeros
    to five characters and searches for permits whose number contains it.

    Yields:
        FormRequest: POST to ``self.start_urls[0]``, handled by
        ``self.parse_list`` with ``currentPage`` and ``param`` in ``meta``.
        Nothing is yielded once the queue is empty.
    """
    if self.check_first:
        self.check_first = False
        self.search_element = SearchCriteria.numberRange(
            int(self.start), int(self.end), 1)

    if len(self.search_element) == 0:
        return

    # str() keeps zfill working even if the range yields ints.
    param = str(self.search_element.pop(0)).zfill(5)
    search_url = self.start_urls[0]
    form_data = {
        'ctl00$RadScriptManager1': 'ctl00$RadScriptManager1|ctl00$cplMain$btnSearch',
        'RadScriptManager1_TSM': ';;System.Web.Extensions, Version=4.0.0.0, Culture=neutral, PublicKeyToken=31bf3856ad364e35:en-US:1453655a-6b8d-49b1-94c2-f77a352f5241:ea597d4b:b25378d2;Telerik.Web.UI, Version=2013.2.717.40, Culture=neutral, PublicKeyToken=121fae78165ba3d4:en-US:0507d587-20ad-4e22-b866-76bd3eaee2df:16e4e7cd:ed16cbdc:f7645509:24ee1bba:92fe8ea0:f46195d3:fa31b949:874f8ea2:19620875:490a9d4e:bd8f85e4:b7778d6c:58366029:e330518b:1e771326:8e6f0d33:6a6d718d;',
        'ctl00$ucLogin$hfDashboardRedirect': 'https://www.cob.org/epermits/dashboard.aspx',
        'ctl00$ucLogin$hfCartRedirect': 'https://www.cob.org/epermits/ShoppingCart.aspx',
        'ctl00$ucLogin$hfViewEditProfile': 'static value',
        'ctl00$ucLogin$hfHome': 'https://www.cob.org/epermits/default.aspx',
        'ctl00$ucLogin$hfSetupAnAccountForPublic': 'https://www.cob.org/epermits/publicUserAccount.aspx?action=npa',
        'ctl00$ucLogin$hfSetupAnAccountForContractor': 'https://www.cob.org/epermits/RegistrationConfirmation.aspx',
        'ctl00$ucLogin$hfContractorCSLBVerification': 'DISABLED',
        'ctl00$ucLogin$ddlSelLogin': '******',
        'ctl00$ucLogin$txtLoginId': 'Username',
        'ctl00_ucLogin_txtLoginId_ClientState': '{"enabled":true,"emptyMessage":"Username","validationText":"","valueAsString":"","lastSetTextBoxValue":"Username"}',
        'ctl00$ucLogin$RadTextBox2': 'Password',
        'ctl00_ucLogin_RadTextBox2_ClientState': '{"enabled":true,"emptyMessage":"Password","validationText":"","valueAsString":"","lastSetTextBoxValue":"Password"}',
        'ctl00_ucLogin_txtPassword_ClientState': '{"enabled":true,"emptyMessage":"","validationText":"","valueAsString":"","lastSetTextBoxValue":""}',
        'ctl00$hfGoogleKey': 'UA-5831706-1',
        'ctl00$cplMain$hfActivityMode': '',
        'ctl00$cplMain$ddSearchBy': 'Permit_Main.PERMIT_NO',
        'ctl00$cplMain$ddSearchOper': 'CONTAINS',
        'ctl00$cplMain$txtSearchString': str(param),
        'ctl00_cplMain_rgSearchRslts_ClientState': '{"selectedIndexes":["0"],"selectedCellsIndexes":[],"unselectableItemsIndexes":[],"reorderedColumns":[],"expandedItems":[],"expandedGroupItems":[],"expandedFilterItems":[],"deletedItems":[],"hidedColumns":[],"showedColumns":[],"groupColsState":{},"hierarchyState":{},"popUpLocations":{},"draggedItemsIndexes":[]}',
        'ctl00_cplMain_tcSearchDetails_ClientState': '{"selectedIndexes":["0"],"logEntries":[],"scrollState":{}}',
        '__EVENTTARGET': 'ctl00$cplMain$btnSearch',
        '__VIEWSTATE': response.xpath(
            '//*[@id="__VIEWSTATE"]/@value').extract_first(),
        '__VIEWSTATEGENERATOR': response.xpath(
            '//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first(),
        '__ASYNCPOST': 'true',
        'RadAJAXControlID': 'ctl00_RadAjaxManager1',
    }
    currentPage = 1
    # The Referer must be a single URL; the original passed the whole
    # start_urls list, which sends one header value per list entry.
    yield FormRequest(url=search_url,
                      headers={'Referer': search_url},
                      formdata=form_data,
                      callback=self.parse_list,
                      dont_filter=True,
                      meta={
                          'currentPage': currentPage,
                          'param': param
                      })
def parse(self, response):
    """Build the search-key queue and post the first business-name search.

    The keys come from ``SearchCriteria.strRange`` when both ``self.start``
    and ``self.end`` are alphabetic, otherwise from
    ``SearchCriteria.numberRange``.  The first key is popped and submitted
    with a captcha response obtained from
    ``self.getcaptchaCoder(self.site_key).resolver(response.url)``.  The
    submitted fields are kept in ``self.form_data``.

    Yields:
        scrapy.FormRequest: POST to ``response.url``, handled by
        ``self.parse_two`` with ``meta['page'] == 2``.  Nothing is yielded
        when the range produced no keys.
    """
    if self.start.isalpha() and self.end.isalpha():
        self.searchkeys = SearchCriteria.strRange(self.start, self.end)
    else:
        self.searchkeys = SearchCriteria.numberRange(
            self.start, self.end, 1)

    # An empty range previously crashed with IndexError on pop(0).  Bail
    # out before the captcha is requested.
    if not self.searchkeys:
        return

    search = self.searchkeys.pop(0)
    self.form_data = {
        'search.SearchName': 'express',
        'search.SearchType': 'BusinessName',
        'search.BusinessName': search,
        'search.ActualBusinessName': search,
        'search.BusinessId': '',
        'search.SearchCriteria': 'StartsWith',
        'search.CitizenShipType': '',
        'search.BusinessTypeId': '0',
        'search.BusinessStatusId': '',
        'search.NaicsCode': '',
        'search.businessStatus': '',
        'search.isGoodStanding': '',
        'search.Country': '',
        'search.Zip': '',
        'search.City': '',
        'search.State': '',
        'search.OtherState': '',
        'search.PostalCode': '',
        'search.AgentType': '',
        'search.RAFirstName': '',
        'search.RAMiddleName': '',
        'search.RALastName': '',
        'search.RASuffix': '',
        'search.RAName': '',
        'search.RAAddress1': '',
        'search.RAAddress2': '',
        'search.RACountry': '',
        'search.RAZip': '',
        'search.RACity': '',
        'search.RAState': '',
        'search.RAOtherState': '',
        'search.RAPostalCode': '',
        'search.DirectorFirstName': '',
        'search.DirectorMiddleName': '',
        'search.DirectorLastName': '',
        'search.DirectorSuffix': '',
        'search.IncorporatorType': '',
        'search.IncorporatorFirstName': '',
        'search.IncorporatorMiddleName': '',
        'search.IncorporatorLastName': '',
        'search.IncorporatorSuffix': '',
        'search.IncorporatorEntityName': '',
        'search.IncorporatorAddress1': '',
        'search.IncorporatorAddress2': '',
        'search.IncorporatorCountry': '',
        'search.IncorporatorZip': '',
        'search.IncorporatorCity': '',
        'search.IncorporatorState': '',
        'search.IncorporatorOtherState': '',
        'search.IncorporatorPostalCode': '',
        'search.OrganizerFirstName': '',
        'search.OrganizerMiddleName': '',
        'search.OrganizerLastName': '',
        'search.OrganizerSuffix': '',
        'search.ReservationNo': '',
        'search.CaptchaResponse':
        self.getcaptchaCoder(self.site_key).resolver(response.url),
    }
    yield scrapy.FormRequest(response.url,
                             formdata=self.form_data,
                             dont_filter=True,
                             method='POST',
                             callback=self.parse_two,
                             meta={'page': 2})