def startLoop(startNumber, mode, location):
    """Run ``runDownLoad`` over a range of numbers, each through a tested proxy.

    Iterates from ``startNumber`` up to (but excluding) ``startNumber + 10000``
    in steps of 10. For every number a fresh User-Agent header is generated
    and proxies are pulled from ``ProxyEngine`` until one of them manages to
    fetch http://icanhazip.com; that proxy is then handed to ``runDownLoad``.

    Args:
        startNumber: First number of the range passed to ``runDownLoad``.
        mode: Forwarded unchanged to ``runDownLoad``.
        location: Forwarded unchanged to ``runDownLoad``.
    """
    START_NUMBER = startNumber
    STOP_NUMBER = START_NUMBER + 10000
    STEP_NUMBER = 10
    # Seconds to wait for the proxy check. Without a timeout, requests may
    # block forever on a proxy that accepts the connection but never answers.
    PROXY_TEST_TIMEOUT = 15

    proxyPath = 'D:/Proxy/Filter/good_proxy_list.txt'
    myProxy = ProxyEngine(proxyPath)
    myProxy.loadData()

    for i in range(START_NUMBER, STOP_NUMBER, STEP_NUMBER):
        user_agent = getRamdomUserAgent()
        headers = {'User-Agent': user_agent}

        proxyOK = False
        proxies = {}
        # NOTE(review): this loop only ends once some proxy passes the check;
        # what nextProxy() does when the list is exhausted is not visible here.
        while not proxyOK:
            proxies = myProxy.nextProxy()
            try:
                urlTest = "http://icanhazip.com"
                resTest = requests.get(
                    urlTest,
                    proxies=proxies,
                    headers=headers,
                    timeout=PROXY_TEST_TIMEOUT,
                )
                print('using IP:', resTest.text)
                proxyOK = True
            except Exception:
                # Deliberately broad (any failure means "try the next proxy"),
                # but no longer a bare except, so Ctrl-C / SystemExit can
                # still stop the retry loop.
                print('Error with proxy:', proxies)

        if proxies:
            runDownLoad(i, proxies, headers, mode, location)
        # NOTE(review): indentation was lost in the source; the pause is
        # assumed to apply to every iteration - confirm.
        time.sleep(15)
def main(startNumber, mode, location):
    """Call ``RunCode`` for every 100th number in a 50000-wide range.

    A single proxy configuration is obtained from ``startPrivateProxy()``
    before the loop and reused for every call; a new User-Agent header is
    generated per iteration, and the loop pauses 10 seconds after each call.

    Args:
        startNumber: First number of the range (inclusive).
        mode: Forwarded unchanged to ``RunCode``.
        location: Forwarded unchanged to ``RunCode``.
    """
    firstNumber = startNumber
    lastNumber = firstNumber + 50000  # exclusive upper bound
    stride = 100

    proxies = startPrivateProxy()

    for number in range(firstNumber, lastNumber, stride):
        headers = {'User-Agent': getRamdomUserAgent()}
        RunCode(number, proxies, headers, mode, location)
        time.sleep(10)
def startLoop(startNumber, mode, location):
    """Call ``runDownLoad`` for every 10th number in a 10000-wide range.

    The proxy configuration comes from one ``startPrivateProxy()`` call made
    before the loop. Each iteration builds a header dict with a newly
    generated User-Agent, invokes ``runDownLoad`` and then sleeps 15 seconds.

    Args:
        startNumber: First number of the range (inclusive).
        mode: Forwarded unchanged to ``runDownLoad``.
        location: Forwarded unchanged to ``runDownLoad``.
    """
    proxies = startPrivateProxy()

    # Range end is exclusive: startNumber, startNumber + 10, ...
    for current in range(startNumber, startNumber + 10000, 10):
        agent = getRamdomUserAgent()
        requestHeaders = {'User-Agent': agent}
        runDownLoad(current, proxies, requestHeaders, mode, location)
        time.sleep(15)
def verifyProxy(proxyList, outPath, timeout=10):
    """Test each proxy against icanhazip.com and append working ones to a file.

    Every entry of ``proxyList`` is used as both the ``http`` and ``https``
    proxy for a GET request to http://icanhazip.com with a freshly generated
    User-Agent. Entries whose request completes are appended (one per line)
    to ``outPath``; failures are only recorded in the returned log.

    Args:
        proxyList: Iterable of proxy addresses to test.
        outPath: File that receives the working proxies (opened in append
            mode, UTF-8).
        timeout: Seconds to wait for each test request. Without a timeout a
            dead proxy can block the whole run indefinitely.

    Returns:
        A list of human-readable status strings: a start line, one line per
        proxy, and a finish line.
    """
    statList = []
    dateStamp = sysHand.getDateStamp()
    statList.append('Start verifying proxy at ' + dateStamp)

    # The context manager guarantees the file is closed even if the run is
    # interrupted (e.g. Ctrl-C) part-way through the list.
    with open(outPath, 'a', encoding='utf-8') as file:
        for item in proxyList:
            user_agent = getRamdomUserAgent()
            headers = {'User-Agent': user_agent}
            proxies = {'http': item, 'https': item}
            print('Verifying:', item)
            print('using agent:', headers['User-Agent'])
            print('using proxy:', proxies)
            try:
                urlTest = "http://icanhazip.com"
                resTest = requests.get(
                    urlTest, proxies=proxies, headers=headers, timeout=timeout
                )
                print('using IP:', resTest.text)
                file.write(str(item) + '\n')
                statList.append('Sucessfully verified IP ' + str(item))
                time.sleep(2)
            # requests re-raises urllib3 failures as requests.exceptions.*,
            # so the urllib3 classes alone never matched here. Both families
            # are caught; ConnectTimeout must precede ConnectionError because
            # it is a subclass of it.
            except (requests.exceptions.ConnectTimeout,
                    urllib3.exceptions.ConnectTimeoutError):
                statList.append('Connection Time Out Error with IP ' + str(item))
                print('Connection Time Out Error with IP ' + str(item))
            except (requests.exceptions.ConnectionError,
                    urllib3.exceptions.ConnectionError):
                print('Connection Error with IP ' + str(item))
                statList.append('Connection Error with IP ' + str(item))
            except urllib3.exceptions.MaxRetryError:
                print('Max Retry Error with IP ' + str(item))
                statList.append('Max Retry Error with IP ' + str(item))
            except Exception as e:
                # Best effort: one bad proxy must not abort the whole run.
                statList.append('Error verifying IP ' + str(item))
                print(e)

    dateStamp = sysHand.getDateStamp()
    statList.append('Finish verifying proxy at ' + dateStamp)
    return statList
def main(dirOut):
    """Fetch Lexico list pages for the key "z" and save the parsed lists.

    For each index in ``range(2, 3, 1)`` (currently just index 2) the page is
    fetched with ``getLexico``, parsed with ``processLexicoList`` and written
    with ``writeListToFile`` to ``<dirOut>/list<key><index>.txt``. The loop
    sleeps 10 seconds after each page.

    Args:
        dirOut: Directory path (string) that receives the output files.
    """
    listWord = "z"
    firstIndex, stopIndex, indexStep = 2, 3, 1

    proxies = startPrivateProxy()

    for pageNumber in range(firstIndex, stopIndex, indexStep):
        print('processing list ', listWord, pageNumber)
        headers = {'User-Agent': getRamdomUserAgent()}

        pageIndex = str(pageNumber)
        # The output path is built before fetching, as in the original order.
        pathOut = dirOut + '/list' + listWord + pageIndex + ".txt"

        htmlContent = getLexico(listWord, pageIndex, proxies, headers)
        wordData = processLexicoList(htmlContent, listWord)
        writeListToFile(wordData, pathOut)
        time.sleep(10)
def startLoop(startNumber, mode, location):
    """Probe raw proxies and log those that can reach Cambridge Dictionary.

    For each step of the range ``startNumber`` .. ``startNumber + 10000``
    (step 10) the function:

    1. pulls proxies from ``ProxyEngine`` until one can fetch
       http://icanhazip.com;
    2. takes the next word from ``DictEngine`` and requests its page on
       dictionary.cambridge.org through that proxy;
    3. on HTTP 200, appends the proxy's ``http`` address to
       ``D:/Proxy/Filter/good_proxy_list.txt``;
    4. sleeps 3 seconds.

    Args:
        startNumber: Start of the range; also passed to ``DictEngine`` as its
            second constructor argument.
        mode: Unused here (the ``runDownLoad`` call that consumed it is
            disabled); kept so callers do not break.
        location: Unused here, kept for the same reason.
    """
    START_NUMBER = startNumber
    STOP_NUMBER = START_NUMBER + 10000
    STEP_NUMBER = 10
    # Seconds to wait for each request. The proxies come from a raw,
    # unverified list, and without a timeout requests can hang indefinitely.
    REQUEST_TIMEOUT = 15

    # PATH TO PROXY LIST
    pathList = 'D:/Proxy/List/raw_list_general.txt'
    myProxy = ProxyEngine(pathList)
    myProxy.loadData()

    pathDict = 'E:/FULLTEXT/DICTIONARY/SPECIALTY/British_standards.txt'
    myDict = DictEngine(pathDict, START_NUMBER)
    myDict.loadData()

    # Working proxies are appended here; hoisted because it never changes.
    pathLog = 'D:/Proxy/Filter/good_proxy_list.txt'

    for _ in range(START_NUMBER, STOP_NUMBER, STEP_NUMBER):
        user_agent = getRamdomUserAgent()
        headers = {'User-Agent': user_agent}

        proxies = {}
        proxyOK = False
        # NOTE(review): nextProxy() is inside the try, so if it keeps raising
        # (for example once the list is exhausted - not visible here) this
        # loop never terminates.
        while not proxyOK:
            try:
                proxies = myProxy.nextProxy()
                badIP = proxies['http']
                print('Attemping on', badIP, '...')
                urlTest = "http://icanhazip.com"
                requests.get(
                    urlTest,
                    proxies=proxies,
                    headers=headers,
                    timeout=REQUEST_TIMEOUT,
                )
                proxyOK = True
            except Exception:
                # Any failure simply means "move on to the next proxy".
                print('Test failed on icanhazip.com')

        if proxies:
            print(proxies)
            nextWord = myDict.nextWord()
            print('nextWord', nextWord)
            url = 'https://dictionary.cambridge.org/dictionary/english/' + nextWord
            try:
                response = requests.get(
                    url,
                    proxies=proxies,
                    headers=headers,
                    timeout=REQUEST_TIMEOUT,
                )
                if response.status_code == 200:
                    goodIP = proxies['http']
                    print('Discoved good proxy:', goodIP)
                    with open(pathLog, "a") as myfile:
                        myfile.write(goodIP + '\n')
            except Exception:
                print('Test failed on dictionary.cambridge.org')

        time.sleep(3)
if (response.status_code == DATA_STATUS_OK): if (response.content): try: soup = BeautifulSoup(response.content, 'lxml') #statusMessage = "Successfully get the word: " + word #print(data) return (str(soup)) except: #statusMessage = "An exception occurred while getting " + word return (None) if __name__ == "__main__": WORD = "0" INDEX = "3" dirOut = "E:/FULLTEXT/LEXICO/LIST/HTML" pathOut = dirOut + '/list' + WORD + INDEX + ".html" proxies = startPrivateProxy() user_agent = getRamdomUserAgent() headers = {'User-Agent': user_agent} htmlContent = getLexico(WORD, INDEX, proxies, headers) if (htmlContent): with open(pathOut, "w", encoding='utf-8') as file: file.write(htmlContent) openDir(dirOut)