def process_files(global_config, attr_definitions, input_dir, recursive=True):
    """Process every not-yet-processed ``Team*.txt`` data file under *input_dir*.

    A database session is opened on the database named by
    ``global_config["db_name"] + global_config["this_season"]``. Candidate
    files are collected with ``get_files`` and each one is handed to
    ``process_file`` unless its base name (the text after the last ``/``)
    is in the ignore list, or ``isFileProcessed`` reports it as already
    handled. A file that raises while being processed is logged and is
    still recorded through ``DataModel.addProcessedFile`` so that a bad
    file is not retried over and over. The session is committed once at
    the end.

    Args:
        global_config: Mapping read for the ``"db_name"``, ``"this_season"``
            and ``"logger"`` keys; it is also forwarded to the helpers.
        attr_definitions: Passed through unchanged to ``process_file``.
        input_dir: Directory to search for data files. It is concatenated
            directly with ``"IgnoreFiles.txt"``, so it is expected to
            already end with a path separator.
        recursive: Forwarded unchanged to ``get_files``.

    Returns:
        None.
    """

    def log_debug(message):
        # Every debug entry carries this function's name as a prefix.
        global_config["logger"].debug("%s - %s" % (process_files.__name__, message))

    start_time = datetime.datetime.now()

    # Initialize the database session connection
    db_name = global_config["db_name"] + global_config["this_season"]
    session = DbSession.open_db_session(db_name)

    # Read the ignore file list each time this function runs. Any file in
    # the ignore list will be skipped.
    ignore_filelist = read_ignore_filelist_cfg(input_dir + "IgnoreFiles.txt")

    # Select all files that conform to the naming format Team*.txt. The dot
    # is escaped so that only a literal ".txt" suffix matches, rather than
    # any character followed by "txt".
    file_regex = re.compile(r"Team[a-zA-Z0-9_]+\.txt")
    files = get_files(global_config, session, db_name, input_dir, file_regex, recursive)

    if files:
        log_msg = "files retrieved, elapsed time - %s" % (str(datetime.datetime.now() - start_time))
        print(log_msg)
        log_debug(log_msg)
        log_debug("%d Files to be processed" % len(files))

    # Process data files
    for data_filename in files:
        # If the file is on the ignore list (quarantined), then skip it
        if data_filename.split("/")[-1] in ignore_filelist:
            log_debug("Ignoring file: %s" % (data_filename,))
            continue

        # Make sure that the data file has not already been processed. The
        # same file has been seen in the list of files more than once.
        if isFileProcessed(global_config, session, db_name, data_filename):
            log_debug("Skipping file: %s, already processed" % (data_filename,))
            continue

        try:
            log_debug("Processing file: %s" % (data_filename,))
            process_file(global_config, session, attr_definitions, data_filename)
        except Exception as e:
            # Log the exception but continue processing other files
            log_debug("Error processing file: %s" % (data_filename,))
            log_exception(global_config["logger"], e)

        # Record the file as processed outside the try/except block so that
        # a bogus file is not attempted again on every pass.
        DataModel.addProcessedFile(session, data_filename)

    # Commit all updates to the database
    session.commit()
def process_files(global_config, attr_definitions, input_dir, recursive=True):
    """Process every ``Team*.txt`` data file returned by ``get_files``.

    A database session is opened on ``global_config['db_name']``, the
    matching files are collected, and each one is handed to
    ``process_file``. A file that raises while being processed is logged
    through ``log_exception`` and is still recorded with
    ``DataModel.addProcessedFile`` so that it is not processed again. The
    session is committed once after all files have been handled.

    Args:
        global_config: Mapping read for the ``'db_name'`` key and, when a
            file fails, the ``'logger'`` key; also forwarded to the helpers.
        attr_definitions: Passed through unchanged to ``process_file``.
        input_dir: Directory to search for data files (forwarded to
            ``get_files``).
        recursive: Forwarded unchanged to ``get_files``.

    Returns:
        None.
    """
    start_time = datetime.datetime.now()

    # Initialize the database session connection
    db_name = global_config['db_name']
    session = DbSession.open_db_session(db_name)

    # Select all files that conform to the naming format Team*.txt. The dot
    # is escaped so that only a literal ".txt" suffix matches, rather than
    # any character followed by "txt".
    file_regex = re.compile(r'Team[a-zA-Z0-9_]+\.txt')
    files = get_files(global_config, session, db_name, input_dir, file_regex, recursive)

    print('files retrieved, elapsed time - %s' % (str(datetime.datetime.now()-start_time)))

    # Process data files
    for data_filename in files:
        try:
            process_file(global_config, session, attr_definitions, data_filename)
        except Exception as e:
            # Log the exception but continue processing other files
            log_exception(global_config['logger'], e)

        # Record the file as processed outside the try/except block so that
        # a bogus file is not attempted again on every pass.
        DataModel.addProcessedFile(session, data_filename)

    # Commit all updates to the database
    session.commit()
def process_files(session, db_name, attr_definitions, input_dir, recursive, test):
    """Process every ``Team*.txt`` data file returned by ``get_files``.

    Each matching file is handed to ``process_file``. A file that raises
    while being processed is logged through ``log_exception`` and is still
    recorded with ``DataModel.addProcessedFile`` so that it is not
    processed again. The caller-supplied session is committed once after
    all files have been handled.

    Args:
        session: Database session used by the helpers and committed at the
            end.
        db_name: Forwarded unchanged to ``get_files``.
        attr_definitions: Passed through unchanged to ``process_file``.
        input_dir: Directory to search for data files (forwarded to
            ``get_files``).
        recursive: Forwarded unchanged to ``get_files``.
        test: Forwarded unchanged to ``get_files``.

    Returns:
        None.
    """
    # Select all files that conform to the naming format Team*.txt. The dot
    # is escaped so that only a literal ".txt" suffix matches, rather than
    # any character followed by "txt".
    file_regex = re.compile(r'Team[a-zA-Z0-9_]+\.txt')
    files = get_files(session, db_name, input_dir, file_regex, recursive, test)

    # Process data files
    for data_filename in files:
        try:
            process_file(session, attr_definitions, data_filename)
        except Exception as e:
            # Log the exception but continue processing other files
            log_exception(e)

        # Record the file as processed outside the try/except block so that
        # a bogus file is not attempted again on every pass.
        DataModel.addProcessedFile(session, data_filename)

    # Commit all updates to the database
    session.commit()