def __init__(self, index=1, browser='chrome', delay=1, url=None, site=None):
    logPath = getPath(N=0) + f'logs{sep}{site}{sep}' if site is not None else getPath(N=0) + f'logs{sep}'
    self._logger = Logger(f'crawler_{index}.log', logPath).getLogger()
    self._driver = self.getBrowser(browser)
    self._driver.maximize_window()  # or fullscreen_window(); maximize the browser window
    self.update_delay(delay)  # update the delay between actions
    self.update_page(url)  # open the url
    self._logger.info('Finished crawler initialization')
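# A hedged usage sketch for the constructor above; the enclosing class is not
# shown in this snippet, so 'Crawler' below is a placeholder name, and the
# url is illustrative.
crawler = Crawler(index=2, browser='firefox', delay=2,
                  url='https://example.org', site='SupremeCourt')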
def __init__(self, num_of_crawlers=0, site=None):
    logPath = getPath(N=0) + f'logs{sep}{site}{sep}' if site is not None else getPath(N=0) + f'logs{sep}'
    self.logger = Logger(f'{site}_Scraper.log', logPath).getLogger()
    self.db = DB(logger=self.logger).getDB(site)
    # 0 means auto: use the CPU count, capped at 4 crawlers; any other value is taken as-is
    self.num_of_crawlers = min(cpu_count(), 4) if num_of_crawlers == 0 else num_of_crawlers
    self.productsFolder = getPath(N=0) + f'products{sep}json_products{sep}'  # product path
    self.backupFolder = getPath(N=0) + f'products{sep}backup_json_products{sep}'
    createDir(self.productsFolder)
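# A minimal sketch of the crawler-count rule above, assuming cpu_count comes
# from the standard library's multiprocessing module (machine values vary).
from multiprocessing import cpu_count

for requested in (0, 2):
    resolved = min(cpu_count(), 4) if requested == 0 else requested
    print(requested, '->', resolved)  # e.g. 0 -> 4 on an 8-core machine, 2 -> 2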
def main():
    _logger = Logger('parser.log', getPath(N=0) + f'logs{sep}').getLogger()
    _logger.info("Parser is starting")
    run(unhandledFolder, _logger, minDelay=0)  # first pass: retry previously unhandled files
    while True:
        _logger.info("Parser is starting a new pass")
        run(readFolder, _logger)
def downloadSync(loop=True):
    _logger = Logger('downloadSync.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        db = DB().getDB('SupremeCourt')
        for folder in downloadFolders:
            counter = 0
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            fileList = [file.replace(folder, '') for file in getFiles(folderPath=folder)]  # extract file names
            for file in cursor:
                if file['name'] not in fileList:  # only download documents missing on disk
                    saveData(file['data'], file['name'], folder)
                    counter += 1
            total += counter
            _logger.info(f"Total {counter} files were downloaded into {folder}")
        _logger.info(f"Total {total} files were downloaded")
        if loop is False:
            break
        callSleep(logger=_logger, hours=1)
def main():
    _logger = Logger('elasticsearch.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        Elastic_5_5_3(_logger).start_index()  # start indexing products into the Elastic DB
        callSleep(logger=_logger, minutes=10)  # after finishing all the files, wait a bit before the next pass
def getBrowser(browser='chrome'):
    path = f'ILCourtScraper{sep}WebDrivers{sep}'
    if browser == 'chrome':
        return webdriver.Chrome(ChromeDriverManager().install())
    elif browser == 'firefox':
        return webdriver.Firefox(GeckoDriverManager().install())
    elif browser == 'edge':
        if system() == 'Windows':
            return webdriver.Edge(executable_path=getPath(N=0) + path + 'msedgedriver.exe')
        raise OSError('the bundled msedgedriver.exe is Windows-only')
    raise ValueError(f'unsupported browser: {browser}')
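# A hedged usage sketch for getBrowser: with selenium and webdriver_manager
# installed, the manager downloads a matching driver binary on first use,
# so no local driver path is needed for Chrome or Firefox.
driver = getBrowser('firefox')
try:
    driver.get('https://example.org')
    print(driver.title)
finally:
    driver.quit()  # always release the browser process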
def uploadSync(loop=True):
    _logger = Logger('uploadSync.log', getPath(N=0) + f'logs{sep}').getLogger()
    while True:
        total = 0
        uCounter = 0
        sCounter = 0
        db = DB().getDB('SupremeCourt')
        for folder in uploadFolders.keys():
            connection = db.get_collection(getFolderName(folder))
            cursor = list(connection.find({}))
            backupFileList = [file['name'] for file in cursor]
            listOfFiles = getFiles(folderPath=folder)
            total += len(listOfFiles)
            _logger.info(f"Got {len(listOfFiles)} files to upload in folder {folder}...")
            if len(listOfFiles) > 0:
                index = 0
                for fileName in listOfFiles:
                    index += 1
                    _logger.info(f"Starting to upload file {index} of {len(listOfFiles)}...")
                    data = readData(fileName, '')
                    fixData(fileName, data)
                    fullFilePath = fileName
                    fileName = fileName.replace(folder, '')  # extract the file name
                    if fileName not in backupFileList:
                        try:
                            connection.insert_one({"name": fileName, "data": data})
                            uCounter += 1
                            _logger.info(f"Succeeded to upload file {fileName}")
                            if folder != uploadFolders[folder]:  # move the file if source and destination differ
                                changeDir(fullFilePath, uploadFolders[folder],
                                          deleteSourceIfDestinationFileExist=True)
                        except Exception as e:  # TODO better Exception
                            _logger.info(f"Failed to upload file {fullFilePath}")
                            _logger.info(e)
                    else:
                        _logger.info("Skipped")
                        sCounter += 1
        _logger.info(f"{uCounter} files uploaded...\n{sCounter} files skipped...\n"
                     f"{total - uCounter - sCounter} failed...\nTotal {total} files")
        if loop is False:
            break
        callSleep(logger=_logger, minutes=10)
import json
from json import JSONDecodeError


def readData(fileName, filePath=None):
    try:
        filePath = getPath() if filePath is None else filePath
        with open(filePath + fileName, encoding='utf8') as json_file:
            data = json.load(json_file)
        return data
    except JSONDecodeError as e:
        print(f'Error decoding this file: {fileName}')
        print(e)
        return ''
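# readData joins filePath + fileName by plain string concatenation, so callers
# either pass a folder that ends with sep, or (as uploadSync does) put the
# full path in fileName and pass '' as filePath. Illustrative calls:
data1 = readData('case_123.json', readFolder)        # folder ends with sep
data2 = readData(f'{readFolder}case_123.json', '')   # full path, empty filePath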
from ILCourtScraper.Extra.logger import Logger
from ILCourtScraper.Extra.time import callSleep
from ILCourtScraper.Extra.json import readData, saveData
from ILCourtScraper.Extra.path import getPath, sep, createDir, getFiles, remove

readFolder = getPath(N=0) + f'products{sep}json_products{sep}'
handledFolder = getPath(N=0) + f'products{sep}handled_json_products{sep}'
unhandledFolder = getPath(N=0) + f'products{sep}unhandled_json_products{sep}'

for f in [readFolder, handledFolder, unhandledFolder]:
    createDir(f)


def clean_spaces(text):
    if type(text) is str:  # if text is a string
        if '\n' in text:  # if there is more than one line
            return clean_spaces(text.splitlines())  # resend it as a list
    else:  # if text is a list
        for index in range(len(text)):  # for each line in the list
            text[index] = clean_spaces(text[index])  # resend one line at a time
        return text
    temp_list = list()  # the kept characters of the cleaned line
    space = ' '
    for index in range(len(text)):
        if text[index] == space:  # if this is a space
            if index != 0:  # if we are not on the first index
                if text[index - 1] == space:  # if the previous character was also a space, drop this one
                    continue
                else:  # in this case we do want to add it
                    temp_list.append(text[index])
            # a space at index 0 is dropped
        else:  # non-space characters are always kept
            temp_list.append(text[index])
    return ''.join(temp_list)  # reconstructed tail: join the kept characters back into one line
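# A quick behavior check of clean_spaces under the reconstruction above:
# runs of spaces collapse to one, a leading space is dropped, and multi-line
# strings come back as a list of cleaned lines.
print(clean_spaces('a  b   c'))                # 'a b c'
print(clean_spaces(' leading  spaces'))        # 'leading spaces'
print(clean_spaces('one  line\ntwo   lines'))  # ['one line', 'two lines']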
def saveData(data, fileName=None, filePath=None):
    fileName = f"dataFromScraper_{currTime()}" if fileName is None else fileName
    filePath = getPath() if filePath is None else filePath
    with open(filePath + fileName, 'w') as outfile:
        json.dump(data, outfile, indent=4, ensure_ascii=True)
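# A hedged note on saveData defaults: with no fileName it timestamps the
# output via currTime(), and ensure_ascii=True escapes non-ASCII (e.g.
# Hebrew) text into \uXXXX form inside the written JSON. Illustrative call:
saveData({'case': 'בג"ץ 1234/20'}, 'case_1234.json', readFolder)
# the file then contains {"case": "\u05d1\u05d2\"\u05e5 1234/20"}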
from ILCourtScraper.Extra.db import DB
from ILCourtScraper.Extra.logger import Logger
from ILCourtScraper.Extra.time import callSleep
from ILCourtScraper.Extra.json import readData, saveData
from ILCourtScraper.Extra.path import getPath, sep, getFiles, createDir, changeDir

handledFolder = getPath(N=0) + f'products{sep}handled_json_products{sep}'
unhandledFolder = getPath(N=0) + f'products{sep}unhandled_json_products{sep}'
backupFolder = getPath(N=0) + f'products{sep}backup_json_products{sep}'
unBackupFolder = getPath(N=0) + f'products{sep}unBackup_json_products{sep}'
elasticFolder = getPath(N=0) + f'products{sep}upload_json_to_elastic{sep}'

# key = source, value = destination
uploadFolders = {
    handledFolder: handledFolder,
    unhandledFolder: unhandledFolder,
    backupFolder: backupFolder,
    unBackupFolder: backupFolder,
    elasticFolder: elasticFolder,
}

downloadFolders = [handledFolder, backupFolder]

for f in [handledFolder, unhandledFolder, backupFolder, unBackupFolder, elasticFolder]:
    createDir(f)


def getFolderName(folder):
    return folder.split(sep)[-2]
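# Since every folder constant above ends with sep, split(sep) leaves an empty
# last element, which is why getFolderName takes index -2. Illustrative, with
# a POSIX-style separator:
#   'products/handled_json_products/'.split('/') == ['products', 'handled_json_products', '']
print(getFolderName(handledFolder))  # -> 'handled_json_products'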