Пример #1
0
 def test_calculate_hash(self):
     """
     Check that calculate_hash() returns the expected digest for a file.

     A temporary file is filled with a fixed, known payload, its path is
     passed to calculate_hash(), and the result is compared against the
     precomputed hash string.  The temporary file is always removed,
     whether or not the assertion passes.
     """
     data = '01010101' * 200
     data_hash = '3d1eb00cc63828b36882f076f35c8cdd'

     # mkstemp() creates and opens the file atomically; mktemp() only
     # returns a name, which another process could claim before we open it.
     fd, tmpname = tempfile.mkstemp('hashtest')
     try:
         # Wrap the already-open descriptor rather than reopening by name.
         fp = os.fdopen(fd, 'w')
         try:
             fp.write(data)
         finally:
             # Close (and flush) before hashing so the full payload is on disk.
             fp.close()

         self.assertEqual(data_hash, calculate_hash(tmpname))
     finally:
         # Clean up even when the write or the assertion fails.
         os.unlink(tmpname)
Пример #2
0
def scan_dir(parser, dirpath, expr, apel_db, processed):
    '''
    Check all files in a directory and parse them if:
     - the names match the regular expression
     - the file is not already in the list of processed files
     
     Add newly parsed files to the processed files list and return it.
    '''
    updated = []
    try:
        log.info('Scanning directory: %s' % dirpath)
        
        for item in os.listdir(dirpath):
            abs_file = os.path.join(dirpath, item)
            if os.path.isfile(abs_file) and expr.match(item):
                # first, calculate the hash of the file:
                file_hash = calculate_hash(abs_file)
                found = False
                # next, try to find corresponding entry
                # in database
                for pf in processed:
                    if pf.get_field('Hash') == file_hash:
                        # we found corresponding record
                        # we will leave this record unmodified
                        updated.append(pf)
                        found = True
                        log.info('%s already parsed, omitting' % abs_file)
                        
                if not found:
                    try:
                        log.info('Parsing file: %s' % abs_file)
                        # try to open as a gzip file, and if it fails try as 
                        # a regular file
                        try:
                            fp = gzip.open(abs_file)
                            parsed, total = parse_file(parser, apel_db, fp)
                        except IOError, e: # not a gzipped file
                            fp = open(abs_file, 'r')
                            parsed, total = parse_file(parser, apel_db, fp)
                            fp.close()
                    except IOError, e:
                        log.error('Cannot open file %s due to: %s' % 
                                     (item, str(e)))
                    except ApelDbException, e:
                        log.error('Failed to parse %s due to a database problem: %s' % (item, e))
                    else:
Пример #3
0
def scan_dir(parser, dirpath, reparse, expr, apel_db, processed):
    '''
    Check all files in a directory and parse them if:
     - the names match the regular expression
     - the file is not already in the list of processed files
     
     Add newly parsed files to the processed files list and return it.
    '''
    log = logging.getLogger(LOGGER_ID)
    updated = []
    try:
        log.info('Scanning directory: %s' % dirpath)
        
        for item in os.listdir(dirpath):
            abs_file = os.path.join(dirpath, item)
            if os.path.isfile(abs_file) and expr.match(item):
                # first, calculate the hash of the file:
                file_hash = calculate_hash(abs_file)
                found = False
                # next, try to find corresponding entry
                # in database
                for pf in processed:
                    if pf.get_field('Hash') == file_hash:
                        # we found corresponding record
                        # we will leave this record unmodified
                        updated.append(pf)
                        found = True
                        
                if reparse or not found:
                    try:
                        log.info('Parsing file: %s' % abs_file)
                        # try to open as a gzip file, and if it fails try as 
                        # a regular file
                        try:
                            fp = gzip.open(abs_file)
                            parsed, total = parse_file(parser, apel_db, fp, reparse)
                        except IOError, e: # not a gzipped file
                            fp = open(abs_file, 'r')
                            parsed, total = parse_file(parser, apel_db, fp, reparse)
                            fp.close()
                    except IOError, e:
                        log.error('Cannot open file %s due to: %s' % 
                                     (item, str(e)))
                    except ApelDbException, e:
                        log.error('Failed to parse %s due to a database problem: %s' % (item, e))
                    else:
Пример #4
0
def scan_dir(parser, dirpath, reparse, expr, apel_db, processed):
    '''
    Check all files in a directory and parse them if:
     - the names match the regular expression
     - the file is not already in the list of processed files
     
     Add newly parsed files to the processed files list and return it.
    '''
    log = logging.getLogger(LOGGER_ID)
    updated = []
    try:
        log.info('Scanning directory: %s', dirpath)
        
        for item in sorted(os.listdir(dirpath)):
            abs_file = os.path.join(dirpath, item)
            if os.path.isfile(abs_file) and expr.match(item):
                # first, calculate the hash of the file:
                file_hash = calculate_hash(abs_file)
                found = False
                unparsed = False
                # next, try to find corresponding entry
                # in database
                for pf in processed:
                    if pf.get_field('Hash') == file_hash:
                        # we found corresponding record
                        # we will leave this record unmodified
                        updated.append(pf)
                        found = True
                        # Check for zero parsed lines so we can warn later on.
                        if pf.get_field('Parsed') == 0:
                            unparsed = True
                        break  # If we find a match, no need to keep checking.

                if reparse or not found:
                    try:
                        log.info('Parsing file: %s', abs_file)
                        # try to open as a bzip2 file, then as a gzip file,
                        # and if it fails try as a regular file
                        #
                        # bz2/gzip doesn't raise an exception when trying
                        # to open a non-gzip file.  Only a read (such as
                        # during parsing) does that.  For files of a wrong
                        # format we will get IOError, empty files can
                        # give EOFError as well.
                        for method in (bz2.BZ2File, gzip.open, open):
                            try:  # this is for Python < 2.5
                                try:
                                    fp = method(abs_file)
                                    parsed, total = parse_file(parser, apel_db,
                                                               fp, reparse)
                                    break
                                except (IOError, EOFError), e:
                                    if method == open:
                                        raise
                            finally:
                                fp.close()
                    except IOError, e:
                        log.error('Cannot parse file %s: %s', item, e)
                    except ApelDbException, e:
                        log.error('Failed to parse %s due to a database problem: %s', item, e)
                    else:
Пример #5
0
def scan_dir(parser, dirpath, reparse, expr, apel_db, processed):
    '''
    Check all files in a directory and parse them if:
     - the names match the regular expression
     - the file is not already in the list of processed files
     
     Add newly parsed files to the processed files list and return it.
    '''
    log = logging.getLogger(LOGGER_ID)
    updated = []

    parserName = parser.__class__.__name__
    try:
        log.info('Scanning directory: %s', dirpath)

        for item in os.listdir(dirpath):
            abs_file = os.path.join(dirpath, item)
            if os.path.isfile(abs_file) and expr.match(item):
                # first, calculate the hash of the file:
                if parserName == "HTCondorCEParser":
                    file_hash = "htce_" + calculate_hash(abs_file)
                else:
                    file_hash = calculate_hash(abs_file)

                found = False
                unparsed = False
                # next, try to find corresponding entry
                # in database
                for pf in processed:
                    if pf.get_field('Hash') == file_hash:
                        # we found corresponding record
                        # we will leave this record unmodified
                        updated.append(pf)
                        found = True
                        # Check for zero parsed lines so we can warn later on.
                        if pf.get_field('Parsed') == 0:
                            unparsed = True
                        break  # If we find a match, no need to keep checking.

                if reparse or not found:
                    try:
                        log.info('Parsing file: %s', abs_file)
                        # try to open as a bzip2 file, then as a gzip file,
                        # and if it fails try as a regular file
                        #
                        # bz2/gzip doesn't raise an exception when trying
                        # to open a non-gzip file.  Only a read (such as
                        # during parsing) does that.  For files of a wrong
                        # format we will get IOError, empty files can
                        # give EOFError as well.
                        for method in (bz2.BZ2File, gzip.open, open):
                            try:  # this is for Python < 2.5
                                try:
                                    fp = method(abs_file)
                                    parsed, total = parse_file(
                                        parser, apel_db, fp, reparse)
                                    break
                                except (IOError, EOFError), e:
                                    if method == open:
                                        raise
                            finally:
                                fp.close()
                    except IOError, e:
                        log.error('Cannot parse file %s: %s', item, e)
                    except ApelDbException, e:
                        log.error(
                            'Failed to parse %s due to a database problem: %s',
                            item, e)
                    else: