def test_calculate_hash(self):
    '''
    Check that calculate_hash() returns the expected digest for a file
    with known contents.

    The payload ('01010101' repeated 200 times) is written to a
    temporary file, the file is hashed by name, and the result is
    compared with the hard-coded reference value. The temporary file is
    removed whether or not the assertion passes.
    '''
    data = '01010101' * 200
    data_hash = '3d1eb00cc63828b36882f076f35c8cdd'

    # mkstemp() creates the file atomically and hands back an open
    # descriptor; mktemp() only returns a name, which leaves a window
    # for another process to create the path first.
    fd, tmpname = tempfile.mkstemp('hashtest')
    try:
        fp = os.fdopen(fd, 'w')
        # try/finally rather than 'with' to stay compatible with the
        # old interpreters the rest of this file supports.
        try:
            fp.write(data)
        finally:
            # Close before hashing so the data is flushed to disk.
            fp.close()
        self.assertEqual(data_hash, calculate_hash(tmpname))
    finally:
        # Always clean up, even when the write or the assertion fails.
        os.unlink(tmpname)
def scan_dir(parser, dirpath, expr, apel_db, processed): ''' Check all files in a directory and parse them if: - the names match the regular expression - the file is not already in the list of processed files Add newly parsed files to the processed files list and return it. ''' updated = [] try: log.info('Scanning directory: %s' % dirpath) for item in os.listdir(dirpath): abs_file = os.path.join(dirpath, item) if os.path.isfile(abs_file) and expr.match(item): # first, calculate the hash of the file: file_hash = calculate_hash(abs_file) found = False # next, try to find corresponding entry # in database for pf in processed: if pf.get_field('Hash') == file_hash: # we found corresponding record # we will leave this record unmodified updated.append(pf) found = True log.info('%s already parsed, omitting' % abs_file) if not found: try: log.info('Parsing file: %s' % abs_file) # try to open as a gzip file, and if it fails try as # a regular file try: fp = gzip.open(abs_file) parsed, total = parse_file(parser, apel_db, fp) except IOError, e: # not a gzipped file fp = open(abs_file, 'r') parsed, total = parse_file(parser, apel_db, fp) fp.close() except IOError, e: log.error('Cannot open file %s due to: %s' % (item, str(e))) except ApelDbException, e: log.error('Failed to parse %s due to a database problem: %s' % (item, e)) else:
def scan_dir(parser, dirpath, reparse, expr, apel_db, processed): ''' Check all files in a directory and parse them if: - the names match the regular expression - the file is not already in the list of processed files Add newly parsed files to the processed files list and return it. ''' log = logging.getLogger(LOGGER_ID) updated = [] try: log.info('Scanning directory: %s' % dirpath) for item in os.listdir(dirpath): abs_file = os.path.join(dirpath, item) if os.path.isfile(abs_file) and expr.match(item): # first, calculate the hash of the file: file_hash = calculate_hash(abs_file) found = False # next, try to find corresponding entry # in database for pf in processed: if pf.get_field('Hash') == file_hash: # we found corresponding record # we will leave this record unmodified updated.append(pf) found = True if reparse or not found: try: log.info('Parsing file: %s' % abs_file) # try to open as a gzip file, and if it fails try as # a regular file try: fp = gzip.open(abs_file) parsed, total = parse_file(parser, apel_db, fp, reparse) except IOError, e: # not a gzipped file fp = open(abs_file, 'r') parsed, total = parse_file(parser, apel_db, fp, reparse) fp.close() except IOError, e: log.error('Cannot open file %s due to: %s' % (item, str(e))) except ApelDbException, e: log.error('Failed to parse %s due to a database problem: %s' % (item, e)) else:
def scan_dir(parser, dirpath, reparse, expr, apel_db, processed): ''' Check all files in a directory and parse them if: - the names match the regular expression - the file is not already in the list of processed files Add newly parsed files to the processed files list and return it. ''' log = logging.getLogger(LOGGER_ID) updated = [] try: log.info('Scanning directory: %s', dirpath) for item in sorted(os.listdir(dirpath)): abs_file = os.path.join(dirpath, item) if os.path.isfile(abs_file) and expr.match(item): # first, calculate the hash of the file: file_hash = calculate_hash(abs_file) found = False unparsed = False # next, try to find corresponding entry # in database for pf in processed: if pf.get_field('Hash') == file_hash: # we found corresponding record # we will leave this record unmodified updated.append(pf) found = True # Check for zero parsed lines so we can warn later on. if pf.get_field('Parsed') == 0: unparsed = True break # If we find a match, no need to keep checking. if reparse or not found: try: log.info('Parsing file: %s', abs_file) # try to open as a bzip2 file, then as a gzip file, # and if it fails try as a regular file # # bz2/gzip doesn't raise an exception when trying # to open a non-gzip file. Only a read (such as # during parsing) does that. For files of a wrong # format we will get IOError, empty files can # give EOFError as well. for method in (bz2.BZ2File, gzip.open, open): try: # this is for Python < 2.5 try: fp = method(abs_file) parsed, total = parse_file(parser, apel_db, fp, reparse) break except (IOError, EOFError), e: if method == open: raise finally: fp.close() except IOError, e: log.error('Cannot parse file %s: %s', item, e) except ApelDbException, e: log.error('Failed to parse %s due to a database problem: %s', item, e) else:
def scan_dir(parser, dirpath, reparse, expr, apel_db, processed): ''' Check all files in a directory and parse them if: - the names match the regular expression - the file is not already in the list of processed files Add newly parsed files to the processed files list and return it. ''' log = logging.getLogger(LOGGER_ID) updated = [] parserName = parser.__class__.__name__ try: log.info('Scanning directory: %s', dirpath) for item in os.listdir(dirpath): abs_file = os.path.join(dirpath, item) if os.path.isfile(abs_file) and expr.match(item): # first, calculate the hash of the file: if parserName == "HTCondorCEParser": file_hash = "htce_" + calculate_hash(abs_file) else: file_hash = calculate_hash(abs_file) found = False unparsed = False # next, try to find corresponding entry # in database for pf in processed: if pf.get_field('Hash') == file_hash: # we found corresponding record # we will leave this record unmodified updated.append(pf) found = True # Check for zero parsed lines so we can warn later on. if pf.get_field('Parsed') == 0: unparsed = True break # If we find a match, no need to keep checking. if reparse or not found: try: log.info('Parsing file: %s', abs_file) # try to open as a bzip2 file, then as a gzip file, # and if it fails try as a regular file # # bz2/gzip doesn't raise an exception when trying # to open a non-gzip file. Only a read (such as # during parsing) does that. For files of a wrong # format we will get IOError, empty files can # give EOFError as well. for method in (bz2.BZ2File, gzip.open, open): try: # this is for Python < 2.5 try: fp = method(abs_file) parsed, total = parse_file( parser, apel_db, fp, reparse) break except (IOError, EOFError), e: if method == open: raise finally: fp.close() except IOError, e: log.error('Cannot parse file %s: %s', item, e) except ApelDbException, e: log.error( 'Failed to parse %s due to a database problem: %s', item, e) else: