def normalize_headers(header_row, long_headers, long_to_short_dict):
    """ Clean the headers (remove extra spaces and lowercase) and convert them to short headers if we're given long
        headers

        Args:
            header_row: an array of the file headers given
            long_headers: boolean indicating if we're using the long versions of the headers (True for long)
            long_to_short_dict: a dictionary containing a mapping from long headers to short ones for this file type

        Yields:
            A string containing the cleaned header name (converted to short version if long versions were provided and
            there is a mapping for that header).
    """
    for header in header_row:
        header = FieldCleaner.clean_string(header)
        # Replace headers that don't match DB but are allowed by the broker with their DB matches
        if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe':
            header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe'
        elif header == 'facevalueloanguarantee':
            header = 'facevalueofdirectloanorloanguarantee'
        elif header == 'budgetauthorityavailableamounttotal_cpe':
            header = 'totalbudgetaryresources_cpe'
        elif header == 'correctionlatedeleteindicator':
            header = 'correctiondeleteindicator'
        elif header == 'place_of_performance_zip4':
            header = 'place_of_performance_zip4a'

        # yield the short header when applicable, otherwise yield the cleaned header, whatever it is
        if long_headers and header in long_to_short_dict:
            yield FieldCleaner.clean_string(long_to_short_dict[header])
        else:
            yield header
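A minimal usage sketch for the generator above. The FieldCleaner stub and the long-to-short mapping here are illustrative assumptions, not the broker's real implementations (the real clean_string also normalizes internal whitespace):

# Hypothetical stand-in for the broker's FieldCleaner, for illustration only
class FieldCleaner:
    @staticmethod
    def clean_string(value):
        # simplified assumption: lowercase and strip surrounding whitespace
        return str(value).strip().lower()

# illustrative mapping; the real one comes from the file type's schema
long_to_short = {'facevalueofdirectloanorloanguarantee': 'face_value_loan_guarantee'}
headers = [' FaceValueLoanGuarantee ', 'flex_mycolumn']
print(list(normalize_headers(headers, long_headers=True, long_to_short_dict=long_to_short)))
# -> ['face_value_loan_guarantee', 'flex_mycolumn']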
def normalize_headers(header_row, long_headers, long_to_short_dict):
    """ Clean the headers (remove extra spaces and lowercase) and convert them to short headers if we're given long
        headers

        Args:
            header_row: an array of the file headers given
            long_headers: boolean indicating if we're using the long versions of the headers (True for long)
            long_to_short_dict: a dictionary containing a mapping from long headers to short ones for this file type

        Yields:
            A string containing the cleaned header name (converted to short version if long versions were provided and
            there is a mapping for that header).
    """
    for header in header_row:
        header = FieldCleaner.clean_name(header)
        # Replace headers that don't match DB but are allowed by the broker with their DB matches
        if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe':
            header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe'
        elif header == 'facevalueloanguarantee':
            header = 'facevalueofdirectloanorloanguarantee'
        elif header == 'budgetauthorityavailableamounttotal_cpe':
            header = 'totalbudgetaryresources_cpe'
        elif header == 'correctionlatedeleteindicator':
            header = 'correctiondeleteindicator'
        elif header == 'place_of_performance_zip4':
            header = 'place_of_performance_zip4a'

        # yield the short header when applicable, otherwise yield the cleaned header, whatever it is
        if long_headers and header in long_to_short_dict:
            yield FieldCleaner.clean_name(long_to_short_dict[header])
        else:
            yield header
def normalize_headers(header_row, long_headers, long_to_short_dict):
    for header in header_row:
        header = FieldCleaner.clean_string(header)
        # Replace correctly spelled header (which does NOT match the db) with the misspelling that DOES match the db
        if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe':
            header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe'
        if long_headers and header in long_to_short_dict:
            yield FieldCleaner.clean_string(long_to_short_dict[header])
        else:
            yield header
def normalize_headers(header_row, long_headers, long_to_short_dict):
    for header in header_row:
        header = FieldCleaner.clean_string(header)
        # Replace correctly spelled header (which does NOT match the db) with the misspelling that DOES match the db
        if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe':
            header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe'
        if header == 'facevalueloanguarantee':
            header = 'facevalueofdirectloanorloanguarantee'
        if long_headers and header in long_to_short_dict:
            yield FieldCleaner.clean_string(long_to_short_dict[header])
        else:
            yield header
Example #5
    def load_fields(file_type_name, schema_file_name):
        """Load specified schema from a .csv."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # get file type object for specified fileTypeName
            file_type = sess.query(FileType).filter(
                FileType.name == file_type_name).one()

            # delete existing schema from database
            SchemaLoader.remove_columns_by_file_type(sess, file_type)

            # get allowable datatypes
            type_query = sess.query(FieldType.name,
                                    FieldType.field_type_id).all()
            types = {
                data_type.name: data_type.field_type_id
                for data_type in type_query
            }

            # add schema to database
            with open(schema_file_name, 'rU') as csvfile:
                reader = csv.DictReader(csvfile)
                file_column_count = 0
                for record in reader:
                    record = FieldCleaner.clean_record(record)

                    fields = ["fieldname", "required", "data_type"]
                    if all(field in record for field in fields):
                        SchemaLoader.add_column_by_file_type(
                            sess, types, file_type,
                            FieldCleaner.clean_string(record["fieldname"]),
                            FieldCleaner.clean_string(
                                record["fieldname_short"]), record["required"],
                            record["data_type"], record["padded_flag"],
                            record["field_length"])
                        file_column_count += 1
                    else:
                        raise ValueError('CSV File does not follow schema')

                sess.commit()
                logger.info({
                    'message':
                    '{} {} schema records added to {}'.format(
                        file_column_count, file_type_name,
                        FileColumn.__tablename__),
                    'message_type':
                    'ValidatorInfo',
                    'file_type':
                    file_type.letter_name
                })
    def count_and_set_headers(self, csv_schema, header_row):
        """ Track how many times we've seen a field we were expecting and set self.expected_headers and
            self.flex_headers

            Args:
                csv_schema: list of FileColumn objects for this file type
                header_row: an array of the file headers given

            Returns:
                expected field dict {[expected field name]: [header count]}
        """
        self.expected_headers = []
        self.flex_headers = []

        # Track how many times we've seen a field we were expecting. Keyed by the shorter, machine-readable column names
        expected_fields = OrderedDict()

        for schema in csv_schema:
            expected_fields[FieldCleaner.clean_name(schema.name_short)] = 0

        for header_value in header_row:
            if header_value not in expected_fields:
                # Add flex headers to flex list
                if str(header_value).startswith("flex_"):
                    self.flex_headers.append(header_value)
                else:
                    self.flex_headers.append(None)
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.expected_headers.append(None)
            else:
                self.flex_headers.append(None)
                self.expected_headers.append(header_value)
                expected_fields[header_value] += 1
        return expected_fields
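A rough harness for the method above, assuming it is callable as a plain function in scope and substituting hypothetical stand-ins (SimpleNamespace for the reader instance, a namedtuple for FileColumn, and a simplified FieldCleaner.clean_name):

from collections import namedtuple
from types import SimpleNamespace

# Hypothetical stand-ins, for illustration only
FakeColumn = namedtuple('FakeColumn', ['name_short'])

class FieldCleaner:
    @staticmethod
    def clean_name(value):
        # simplified assumption: lowercase and strip surrounding whitespace
        return str(value).strip().lower()

reader = SimpleNamespace()
schema = [FakeColumn('total_obligations')]
expected = count_and_set_headers(reader, schema, ['total_obligations', 'flex_note', 'unknown_col'])
print(expected)                  # OrderedDict([('total_obligations', 1)])
print(reader.expected_headers)   # ['total_obligations', None, None]
print(reader.flex_headers)       # [None, 'flex_note', None]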
    def validateSum(value, fields_to_sum, record):
        """ Check that the value of one field is the sum of others

        :param value: The field which holds the sum we will validate against
        :param fields_to_sum: A comma separated list of fields which we should sum. These should be valid Decimals
        :param record: Record containing the data for the current record
        :return: True if the sum of fields is equal to the designated sum field
        """

        decimalValues = []

        # Validate that our sum is a decimal
        if Validator.checkType(str(value), 'DECIMAL'):
            decimalSum = Validator.getType(value, 'DECIMAL')
        else:
            return False

        # Validate each field we are summing is a decimal and store their values in an array
        for field in Validator.cleanSplit(fields_to_sum, True):
            entry = record[FieldCleaner.cleanName(field)]
            if entry is None or entry == "":
                decimalValues.append(0)
            elif Validator.checkType(entry, 'DECIMAL'):
                decimalValues.append(Validator.getType(entry, 'DECIMAL'))
            else:
                return False

        return decimalSum == sum(decimalValues)
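The rule boils down to: parse everything as a decimal, treat blank entries as zero, and compare the target value with the sum. A simplified standalone sketch of that idea (it bypasses the broker's Validator/FieldCleaner helpers, which are assumed here to handle normalization and type checking):

from decimal import Decimal, InvalidOperation

def sum_matches(total_value, fields_to_sum, record):
    """Simplified re-implementation of the sum check, for illustration only."""
    try:
        expected = Decimal(str(total_value))
        parts = []
        for field in (f.strip().lower() for f in fields_to_sum.split(',')):
            entry = record.get(field)
            # blank or missing entries count as zero, anything non-numeric fails
            parts.append(Decimal(str(entry)) if entry not in (None, '') else Decimal('0'))
    except InvalidOperation:
        return False
    return expected == sum(parts)

record = {'gross_outlay_amount': '75.5', 'obligations_incurred': '24.5'}
print(sum_matches('100', 'gross_outlay_amount, obligations_incurred', record))  # True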
    def conditionalRequired(cls, data,rule,datatype,interfaces,record, isList = False):
        """ If conditional rule passes, data must not be empty

        Args:
            data: Data to be checked
            rule: Rule object to test against
            datatype: Type to convert data into
            interfaces: InterfaceHolder object to the databases
            record: Some rule types require the entire record as a dict
        """
        # Get rule object for conditional rule
        conditionalRule = interfaces.validationDb.getRuleByLabel(rule.rule_text_1)
        if conditionalRule.file_column is not None:
            # This is a single field rule
            conditionalTypeId = conditionalRule.file_column.field_types_id
            conditionalDataType = interfaces.validationDb.getFieldTypeById(conditionalTypeId)
            conditionalData = record[conditionalRule.file_column.name]
        else:
            conditionalDataType = None
            conditionalData = record
        # If conditional rule passes, check that data is not empty
        if Validator.evaluateRule(conditionalData,conditionalRule,conditionalDataType,interfaces,record):
            if isList:
                # rule_text_2 is a list of fields
                fieldList = rule.rule_text_2.split(",")
                for field in fieldList:
                    if not cls.isFieldPopulated(record[FieldCleaner.cleanName(field)]):
                        # If any are empty, rule fails
                        return False
            else:
                # data is value from a single field
                return cls.isFieldPopulated(data)
        else:
            # If conditional rule fails, this field is not required, so the conditional requirement passes
            return True
    def count_and_set_headers(self, csv_schema, header_row):
        """Track how many times we've seen a field we were expecting and set self.expected_headers and
        self.flex_headers"""
        self.expected_headers = []
        self.flex_headers = []

        # Track how many times we've seen a field we were expecting. Keyed by the shorter, machine-readable column names
        expected_fields = {}

        for schema in csv_schema:
            expected_fields[FieldCleaner.clean_string(schema.name_short)] = 0

        for header_value in header_row:
            if header_value not in expected_fields:
                # Add flex headers to flex list
                if str(header_value).startswith("flex_"):
                    self.flex_headers.append(header_value)
                else:
                    self.flex_headers.append(None)
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.expected_headers.append(None)
            else:
                self.flex_headers.append(None)
                self.expected_headers.append(header_value)
                expected_fields[header_value] += 1
        return expected_fields
Example #10
    def evaluateRule(cls,data,rule,datatype,interfaces,record):
        """ Checks data against specified rule

        Args:
            data: Data to be checked
            rule: Rule object to test against
            datatype: Type to convert data into
            interfaces: InterfaceHolder object to the databases
            record: Some rule types require the entire record as a dict

        Returns:
            True if rule passed, False otherwise
        """
        if data is None:
            # Treat blank as an empty string
            data = ""
        value = rule.rule_text_1
        currentRuleType = rule.rule_type.name
        # Call specific rule function
        ruleFunction = "_".join(["rule",str(currentRuleType).lower()])
        ruleFunction = FieldCleaner.cleanString(ruleFunction)
        try:
            ruleMethod = getattr(cls, str(ruleFunction))
            return ruleMethod(data, value, rule, datatype, interfaces, record)
        except AttributeError as e:
            # Unrecognized rule type
            raise ResponseException(str(e), StatusCode.INTERNAL_ERROR, ValueError)
    def read_record(self, reader, writer, row_number, job, fields, error_list):
        """ Read and process the next record

        Args:
            reader: CsvReader object
            writer: CsvWriter object
            row_number: Next row number to be read
            job: current job
            fields: List of FileColumn objects for this file type
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            Tuple with six elements:
            1. Dict of record after preprocessing
            2. Boolean indicating whether to reduce row count
            3. Boolean indicating whether to skip row
            4. Boolean indicating whether to stop reading
            5. Row error has been found
            6. List of flex fields for the row
        """
        reduce_row = False
        row_error_found = False
        job_id = job.job_id
        try:
            (next_record, flex_fields) = reader.get_next_record()
            record = FieldCleaner.clean_row(
                next_record, self.long_to_short_dict[job.file_type_id], fields)
            record["row_number"] = row_number
            for flex_field in flex_fields:
                flex_field.submission_id = job.submission_id
                flex_field.job_id = job.job_id
                flex_field.row_number = row_number
                flex_field.file_type_id = job.file_type_id

            if reader.is_finished and len(record) < 2:
                # This is the last line and is empty, don't record an error
                return {}, True, True, True, False, []  # Don't count this row
        except ResponseException:
            if reader.is_finished and reader.extra_line:
                # Last line may be blank don't record an error,
                # reader.extra_line indicates a case where the last valid line has extra line breaks
                # Don't count last row if empty
                reduce_row = True
            else:
                writer.writerow([
                    "Formatting Error", ValidationError.readErrorMsg,
                    str(row_number), ""
                ])
                error_list.record_row_error(
                    job_id,
                    job.filename,
                    "Formatting Error",
                    ValidationError.readError,
                    row_number,
                    severity_id=RULE_SEVERITY_DICT['fatal'])
                row_error_found = True

            return {}, reduce_row, True, False, row_error_found, []
        return record, reduce_row, False, False, row_error_found, flex_fields
def use_long_headers(header_row, long_to_short_dict):
    """Check to see if header contains long or short column names"""
    col_matches = 0
    for value in header_row:
        if FieldCleaner.clean_string(value) in long_to_short_dict:
            col_matches += 1
    # if most of the column headers are in the long format, we'll treat the file as having long headers
    return col_matches > .5 * len(header_row)
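A quick usage sketch; the FieldCleaner stub and the long-to-short mapping are hypothetical, just to show the majority-vote behaviour:

# Hypothetical stand-ins, for illustration only
class FieldCleaner:
    @staticmethod
    def clean_string(value):
        return str(value).strip().lower()

long_to_short = {
    'totalbudgetaryresources_cpe': 'total_budgetary_resources_cpe',
    'obligationsincurredtotalbyappropriation_cpe': 'obligations_incurred_cpe',
}
print(use_long_headers(['TotalBudgetaryResources_CPE', 'flex_note'], long_to_short))
# False -- only 1 of 2 headers is in the long format, which is not a strict majority
print(use_long_headers(['TotalBudgetaryResources_CPE',
                        'ObligationsIncurredTotalByAppropriation_CPE',
                        'flex_note'], long_to_short))
# True -- 2 of 3 headers match the long names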
    def load_fields(file_type_name, schema_file_name):
        """Load specified schema from a .csv."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # get file type object for specified fileTypeName
            file_type = sess.query(FileType).filter(FileType.name == file_type_name).one()

            # delete existing schema from database
            SchemaLoader.remove_columns_by_file_type(sess, file_type)

            # get allowable datatypes
            type_query = sess.query(FieldType.name, FieldType.field_type_id).all()
            types = {data_type.name: data_type.field_type_id for data_type in type_query}

            # add schema to database
            with open(schema_file_name, 'rU') as csvfile:
                reader = csv.DictReader(csvfile)
                file_column_count = 0
                for record in reader:
                    record = FieldCleaner.clean_record(record)

                    fields = ["fieldname", "required", "data_type"]
                    if all(field in record for field in fields):
                        SchemaLoader.add_column_by_file_type(
                            sess,
                            types,
                            file_type,
                            FieldCleaner.clean_string(record["fieldname"]),
                            FieldCleaner.clean_string(record["fieldname_short"]),
                            record["required"],
                            record["data_type"],
                            record["padded_flag"],
                            record["field_length"])
                        file_column_count += 1
                    else:
                        raise ValueError('CSV File does not follow schema')

                sess.commit()
                logger.info({
                    'message': '{} {} schema records added to {}'.format(file_column_count, file_type_name,
                                                                         FileColumn.__tablename__),
                    'message_type': 'ValidatorInfo',
                    'file_type': file_type.letter_name
                })
Example #16
    def getFieldsByFileList(self, fileType):
        """ Returns a list of valid field names that can appear in this type of file

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

        Returns:
        list of names
        """
        fileId = self.getFileTypeIdByName(fileType)
        if (fileId is None):
            raise ValueError("Filetype does not exist")
        queryResult = self.session.query(FileColumn).filter(
            FileColumn.file_id == fileId).all()
        for result in queryResult:
            result.name = FieldCleaner.cleanString(
                result.name)  # Standardize field names
            result.name_short = FieldCleaner.cleanString(result.name_short)
        return queryResult
Example #17
    def load_labels(cls, filename):
        """Load non-SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(ValidationLabel).delete()

            filename = os.path.join(cls.validation_labels_path, filename)

            # open csv
            with open(filename, 'rU') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join([
                        "Found unexpected fields: ",
                        str(list(unknown_fields))
                    ]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join([
                        "Missing required fields: ",
                        str(list(missing_fields))
                    ]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    validation_label = ValidationLabel(
                        label=row['label'],
                        error_message=row['error_message'],
                        column_name=row['column_name'],
                        label_type=row['label_type'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception(
                            "{}: file type={}, rule label={}. Rule not loaded."
                            .format(e, row["file_type"], row["rule_label"]))

                    validation_label.file_id = file_id

                    sess.merge(validation_label)
            sess.commit()
Example #18
    def readRecord(self, reader, writer, fileType, interfaces, rowNumber,
                   jobId, fields):
        """ Read and process the next record

        Args:
            reader: CsvReader object
            writer: CsvWriter object
            fileType: Type of file for current job
            interfaces: InterfaceHolder object
            rowNumber: Next row number to be read
            jobId: ID of current job

        Returns:
            Tuple with five elements:
            1. Dict of record after preprocessing
            2. Boolean indicating whether to reduce row count
            3. Boolean indicating whether to skip row
            4. Boolean indicating whether to stop reading
            5. Row error has been found
        """
        errorInterface = interfaces.errorDb
        reduceRow = False
        rowErrorFound = False
        try:

            record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType,
                                           interfaces.validationDb,
                                           self.longToShortDict, fields)
            record["row_number"] = rowNumber
            if reader.isFinished and len(record) < 2:
                # This is the last line and is empty, don't record an error
                return {}, True, True, True, False  # Don't count this row
        except ResponseException as e:
            if reader.isFinished and reader.extraLine:
                # Last line may be blank, don't record an error; reader.extraLine indicates a case where the last valid line has extra line breaks
                # Don't count last row if empty
                reduceRow = True
            else:
                writer.write([
                    "Formatting Error", ValidationError.readErrorMsg,
                    str(rowNumber), ""
                ])
                errorInterface.recordRowError(
                    jobId,
                    self.filename,
                    "Formatting Error",
                    ValidationError.readError,
                    rowNumber,
                    severity_id=interfaces.validationDb.getRuleSeverityId(
                        "fatal"))
                rowErrorFound = True
            return {}, reduceRow, True, False, rowErrorFound
        return record, reduceRow, False, False, rowErrorFound
Example #19
    def insert(self, record, fileType):
        """ Write single record to this table
        Args:
        record: dict with column names as keys
        fileType: Type of file record is in

        Returns:
        True if successful
        """

        # Need to translate the provided record to use column IDs instead of field names for keys
        idRecord = {}
        for key in record:
            idRecord[str(
                self.interfaces.validationDb.getColumnId(
                    key, fileType))] = record[key]

        if (self.BATCH_INSERT):
            if (self.INSERT_BY_ORM):
                raise NotImplementedError(
                    "Have not implemented ORM method for batch insert")
            else:
                self.batch.append(idRecord)
                if (len(self.batch) > self.BATCH_SIZE):
                    # Time to write the batch
                    self.interface.connection.execute(
                        self.orm.__table__.insert(), self.batch)
                    # Reset batch
                    self.batch = []
                return True
        else:
            if (self.INSERT_BY_ORM):
                try:
                    recordOrm = self.orm()
                except:
                    # createTable was not called
                    raise Exception("Must call createTable before writing")

                attributes = self.getPublicMembers(recordOrm)

                # For each field, add value to ORM object
                for key in idRecord:
                    attr = FieldCleaner.cleanString(key)  #key.replace(" ","_")
                    setattr(recordOrm, attr, idRecord[key])

                self.interface.session.add(recordOrm)
                self.interface.session.commit()
                return True
            else:
                raise ValueError(
                    "Must do either batch or use ORM, cannot set both to False"
                )
Example #20
    def rule_check_prefix(cls, data, value, rule, datatype, interfaces, record):
        """ Check that 1-digit prefix is consistent with reimbursable flag """
        dataString = FieldCleaner.cleanString(data)

        # Load target field and dict to compare with
        targetField = FieldCleaner.cleanName(rule.rule_text_1)
        prefixMap = json.loads(str(rule.rule_text_2))

        # Check that character and value are consistent with dict in rule_text_2
        if dataString[0] not in prefixMap:
            # Unknown prefix, this is a failure
            return False
        source = prefixMap[dataString[0]]
        target = record[targetField]
        source = source.lower() if source is not None else source
        target = target.lower() if target is not None else target

        if source == target:
            # Matches the value in target field, rule passes
            return True
        else:
            return False
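An illustrative harness for the rule above, calling it as a plain function with SimpleNamespace standing in for the rule object; the field name and prefix map are made-up values, and cls/value/datatype/interfaces are passed as None because this rule does not use them:

import json
from types import SimpleNamespace

# Hypothetical FieldCleaner stand-in, for illustration only
class FieldCleaner:
    @staticmethod
    def cleanString(value):
        return str(value).strip().lower()

    @staticmethod
    def cleanName(value):
        return str(value).strip().lower()

rule = SimpleNamespace(
    rule_text_1='reimbursable_flag',                 # hypothetical target field name
    rule_text_2=json.dumps({'d': 'd', 'r': 'r'}),    # prefix -> expected flag value
)
record = {'reimbursable_flag': 'D'}
print(rule_check_prefix(None, 'D123', None, rule, None, None, record))  # True  ('d' prefix matches flag)
print(rule_check_prefix(None, 'R123', None, rule, None, None, record))  # False (prefix says 'r', flag is 'd')
print(rule_check_prefix(None, 'X123', None, rule, None, None, record))  # False (unknown prefix)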
    def insert(self, record, fileType):
        """ Write single record to this table
        Args:
        record: dict with column names as keys
        fileType: Type of file record is in

        Returns:
        True if successful
        """

        # Need to translate the provided record to use column IDs instead of field names for keys
        idRecord = {}
        # Mark if header
        for key in record:
            if key == "row":
                idRecord[key] = record[key]
            else:
                idRecord[str(self.interfaces.validationDb.getColumnId(key,fileType))] = record[key]

        if(self.BATCH_INSERT):
            if(self.INSERT_BY_ORM):
                raise NotImplementedError("Have not implemented ORM method for batch insert")
            else:
                self.batch.append(idRecord)
                if(len(self.batch)>self.BATCH_SIZE):
                    # Time to write the batch
                    self.interface.connection.execute(self.orm.__table__.insert(),self.batch)
                    # Reset batch
                    self.batch = []
                return True
        else:
            if(self.INSERT_BY_ORM):
                try:
                    recordOrm = self.orm()
                except:
                    # createTable was not called
                    raise Exception("Must call createTable before writing")

                attributes = self.getPublicMembers(recordOrm)

                # For each field, add value to ORM object
                for key in idRecord:
                    attr = FieldCleaner.cleanString(key) #key.replace(" ","_")
                    setattr(recordOrm,attr,idRecord[key])

                self.interface.session.add(recordOrm)
                self.interface.session.commit()
                return True
            else:
                raise ValueError("Must do either batch or use ORM, cannot set both to False")
Example #22
def clean_numbers(value):
    """ Removes commas from strings representing numbers

        Args:
            value: the value to remove commas from

        Returns:
            The original value with commas removed if there were any
    """
    if value is not None:
        temp_value = str(value).replace(',', '')
        if FieldCleaner.is_numeric(temp_value):
            return temp_value
    return value
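A quick usage sketch, with a hypothetical is_numeric stub in place of the broker's FieldCleaner helper:

# Hypothetical FieldCleaner stand-in, for illustration only
class FieldCleaner:
    @staticmethod
    def is_numeric(value):
        # simplified assumption: accept anything float() can parse
        try:
            float(value)
            return True
        except ValueError:
            return False

print(clean_numbers('1,234,567.89'))  # '1234567.89'
print(clean_numbers('N/A'))           # 'N/A' (not numeric, left untouched)
print(clean_numbers(None))            # None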
    def read_record(self, reader, writer, row_number, job, fields, error_list):
        """ Read and process the next record

        Args:
            reader: CsvReader object
            writer: CsvWriter object
            row_number: Next row number to be read
            job: current job
            fields: List of FileColumn objects for this file type
            error_list: instance of ErrorInterface to keep track of errors

        Returns:
            Tuple with six elements:
            1. Dict of record after preprocessing
            2. Boolean indicating whether to reduce row count
            3. Boolean indicating whether to skip row
            4. Boolean indicating whether to stop reading
            5. Row error has been found
            6. List of flex fields for the row
        """
        reduce_row = False
        row_error_found = False
        job_id = job.job_id
        try:
            (next_record, flex_fields) = reader.get_next_record()
            record = FieldCleaner.clean_row(next_record, self.long_to_short_dict, fields)
            record["row_number"] = row_number
            for flex_field in flex_fields:
                flex_field.submission_id = job.submission_id
                flex_field.job_id = job.job_id
                flex_field.row_number = row_number
                flex_field.file_type_id = job.file_type_id

            if reader.is_finished and len(record) < 2:
                # This is the last line and is empty, don't record an error
                return {}, True, True, True, False, []  # Don't count this row
        except ResponseException:
            if reader.is_finished and reader.extra_line:
                # Last line may be blank don't record an error,
                # reader.extra_line indicates a case where the last valid line has extra line breaks
                # Don't count last row if empty
                reduce_row = True
            else:
                writer.write(["Formatting Error", ValidationError.readErrorMsg, str(row_number), ""])
                error_list.record_row_error(job_id, job.filename, "Formatting Error", ValidationError.readError,
                                            row_number, severity_id=RULE_SEVERITY_DICT['fatal'])
                row_error_found = True

            return {}, reduce_row, True, False, row_error_found, []
        return record, reduce_row, False, False, row_error_found, flex_fields
    def loadFields(fileTypeName, schemaFileName):
        """Load specified schema from a .csv."""
        with createApp().app_context():
            sess = GlobalDB.db().session

            # get file type object for specified fileTypeName
            fileType = sess.query(FileTypeValidation).filter(
                FileTypeValidation.name == fileTypeName).one()

            # delete existing schema from database
            SchemaLoader.removeColumnsByFileType(sess, fileType)

            # get allowable datatypes
            typeQuery = sess.query(FieldType.name,
                                   FieldType.field_type_id).all()
            types = {type.name: type.field_type_id for type in typeQuery}

            # add schema to database
            with open(schemaFileName, 'rU') as csvfile:
                reader = csv.DictReader(csvfile)
                for record in reader:
                    record = FieldCleaner.cleanRecord(record)

                    fields = ["fieldname", "required", "data_type"]
                    if all(field in record for field in fields):
                        SchemaLoader.addColumnByFileType(
                            sess, types, fileType,
                            FieldCleaner.cleanString(record["fieldname"]),
                            FieldCleaner.cleanString(
                                record["fieldname_short"]), record["required"],
                            record["data_type"], record["padded_flag"],
                            record["field_length"])
                    else:
                        raise ValueError('CSV File does not follow schema')

                sess.commit()
Example #25
    def getFieldsByFile(self, fileType, shortCols=False):
        """ Returns a dict of valid field names that can appear in this type of file

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)
        shortCols -- If true, return the short column names instead of the long names

        Returns:
        dict with field names as keys and values are ORM object FileColumn
        """
        returnDict = {}
        fileId = self.getFileTypeIdByName(fileType)
        if (fileId is None):
            raise ValueError("File type does not exist")
        queryResult = self.session.query(FileColumn).options(
            subqueryload("field_type")).filter(
                FileColumn.file_id == fileId).all()
        for column in queryResult:
            if shortCols:
                returnDict[FieldCleaner.cleanString(
                    column.name_short)] = column
            else:
                returnDict[FieldCleaner.cleanString(column.name)] = column
        return returnDict
Example #26
    def rule_exists_in_table(cls, data, value, rule, datatype, interfaces, record):
        """ Check that field value exists in specified table, rule_text_1 has table and column to check against, rule_text_2 is length to pad to """
        ruleTextOne = str(rule.rule_text_1).split(",")
        if len(ruleTextOne) != 2:
            # Bad rule definition
            raise ResponseException("exists_in_table rule incorrectly defined, must have both table and field in rule_text_one",StatusCode.INTERNAL_ERROR,ValueError)
        # Not putting model name through FieldCleaner because model names will have uppercase
        model = getattr(domainModels,str(ruleTextOne[0]).strip())
        field = FieldCleaner.cleanString(ruleTextOne[1])
        ruleTextTwo = FieldCleaner.cleanString(rule.rule_text_2)
        if len(ruleTextTwo) == 0:
            # Skip padding
            paddedData = FieldCleaner.cleanString(data)
        else:
            # Pad data to correct length
            try:
                padLength = int(ruleTextTwo)
            except ValueError as e:
                # Need an integer in rule_text_two
                raise ResponseException("Need an integer width in rule_text_two for exists_in_table rules",StatusCode.INTERNAL_ERROR,ValueError)
            paddedData = FieldCleaner.cleanString(data).zfill(padLength)

        # Build query for model and field specified
        query = interfaces.validationDb.session.query(model).filter(getattr(model,field) == paddedData)
        try:
            # Check that value exists in table, should be unique
            interfaces.validationDb.runUniqueQuery(query,"Data not found in table", "Conflicting entries found for this data")
            # If unique result found, rule passed
            return True
        except ResponseException as e:
            # If exception is no result found, rule failed
            if type(e.wrappedException) == type(NoResultFound()):
                return False
            else:
                # This is an unexpected exception, so re-raise it
                raise
    def getFieldsByFile(self, fileType):
        """ Returns a dict of valid field names that can appear in this type of file

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

        Returns:
        dict with field names as keys and values are ORM object FileColumn
        """
        returnDict = {}
        fileId = self.getFileId(fileType)
        if(fileId is None) :
            raise ValueError("File type does not exist")
        queryResult = self.session.query(FileColumn).options(subqueryload("field_type")).filter(FileColumn.file_id == fileId).all()
        for column in queryResult :
            returnDict[FieldCleaner.cleanString(column.name)]  = column
        return returnDict
    def getFieldsByFileList(self, fileType):
        """ Returns a list of valid field names that can appear in this type of file

        Args:
        fileType -- One of the set of valid types of files (e.g. Award, AwardFinancial)

        Returns:
        list of names
        """
        fileId = self.getFileId(fileType)
        returnList  = []
        if(fileId is None) :
            raise ValueError("Filetype does not exist")
        queryResult = self.session.query(FileColumn).filter(FileColumn.file_id == fileId).all()
        for result in queryResult:
            result.name = FieldCleaner.cleanString(result.name) # Standardize field names
        return queryResult
    def load_labels(cls, filename):
        """Load non-SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(ValidationLabel).delete()

            filename = os.path.join(cls.validation_labels_path, filename)

            # open csv
            with open(filename, 'rU') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    validation_label = ValidationLabel(label=row['label'], error_message=row['error_message'],
                                                       column_name=row['column_name'], label_type=row['label_type'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["file_type"], row["rule_label"]))

                    validation_label.file_id = file_id

                    sess.merge(validation_label)
            sess.commit()
Example #30
    def requireOne(record, fields, interfaces):
        """ Require at least one of the specified fields to be present

        Args:
            record: Dict for current record
            fields: List of fields to check
            interfaces: interface holder for DBs

        Returns:
            True if at least one of the fields is present
        """
        for field in fields:
            fieldName = FieldCleaner.cleanName(field)
            if fieldName in record and record[fieldName] is not None and str(record[fieldName]).strip() != "":
                # If data is present in this field, rule is satisfied
                return True

        # If all were empty, return false
        return False
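A small harness for requireOne, calling it as a plain function with a hypothetical cleanName stub; interfaces is unused by this rule, so None is passed:

# Hypothetical FieldCleaner stand-in, for illustration only
class FieldCleaner:
    @staticmethod
    def cleanName(value):
        return str(value).strip().lower()

record = {'piid': '', 'fain': 'ABC-123', 'uri': None}
print(requireOne(record, ['PIID', 'FAIN', 'URI'], None))  # True  -- fain is populated
print(requireOne(record, ['PIID', 'URI'], None))          # False -- both fields are blank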
Example #31
    def open_file(self, region, bucket, filename, csv_schema, bucket_name,
                  error_filename, long_to_short_dict):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
            bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
            filename: The file path for the CSV file in S3
            csv_schema: list of FileColumn objects for this file type
            bucket_name: bucket to send errors to
            error_filename: filename for error report
            long_to_short_dict: mapping of long to short schema column names
        """

        self.filename = filename
        self.unprocessed = ''
        self.extra_line = False
        self.lines = []
        self.flex_dictionary = {}
        self.header_dictionary = {}
        self.packet_counter = 0
        current = 0
        self.is_finished = False
        self.column_count = 0
        line = self._get_line()
        # make sure we have not finished reading the file

        if self.is_finished:
            # Write header error for no header row
            with self.get_writer(bucket_name, error_filename, ["Error Type"],
                                 self.is_local) as writer:
                writer.write(["No header row"])
                writer.finishBatch()
            raise ResponseException("CSV file must have a header",
                                    StatusCode.CLIENT_ERROR, ValueError,
                                    ValidationError.singleRow)

        duplicated_headers = []
        #create the header

        # check delimiters in header row
        pipe_count = line.count("|")
        comma_count = line.count(",")

        if pipe_count != 0 and comma_count != 0:
            # Write header error for mixed delimiter use
            with self.get_writer(bucket_name, error_filename, ["Error Type"],
                                 self.is_local) as writer:
                writer.write([
                    "Cannot use both ',' and '|' as delimiters. Please choose one."
                ])
                writer.finishBatch()
            raise ResponseException(
                "Error in header row: CSV file must use only '|' or ',' as the delimiter",
                StatusCode.CLIENT_ERROR, ValueError,
                ValidationError.headerError)

        self.delimiter = "|" if line.count("|") != 0 else ","

        # Set the list of possible_fields, using the shorter,
        # machine-readable column names
        possible_fields = {}
        for schema in csv_schema:
            possible_fields[FieldCleaner.cleanString(schema.name_short)] = 0

        for row in csv.reader([line],
                              dialect='excel',
                              delimiter=self.delimiter):
            # check to see if header contains long or short column names
            col_matches = 0
            for value in row:
                if FieldCleaner.cleanString(value) in long_to_short_dict:
                    col_matches += 1
            # if most of the column headers are in the long format,
            # we'll treat the file as having long headers
            if col_matches > .5 * len(row):
                long_headers = True
            else:
                long_headers = False

            for cell in row:
                submitted_header_value = FieldCleaner.cleanString(cell)
                if long_headers and submitted_header_value in long_to_short_dict:
                    header_value = FieldCleaner.cleanString(
                        long_to_short_dict[submitted_header_value])
                elif long_headers:
                    header_value = None
                else:
                    header_value = submitted_header_value
                if header_value not in possible_fields:
                    # Add flex headers to flex list
                    if str(submitted_header_value).startswith("flex_"):
                        self.flex_dictionary[current] = submitted_header_value
                    else:
                        self.flex_dictionary[current] = None
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.header_dictionary[current] = None
                    current += 1
                elif possible_fields[header_value] == 1:
                    # Add header value (as submitted) to duplicated header list
                    duplicated_headers.append(submitted_header_value)
                else:
                    self.header_dictionary[current] = header_value
                    possible_fields[header_value] = 1
                    current += 1

        self.column_count = current

        # Check that all required fields exist
        missing_headers = []
        for schema in csv_schema:
            if possible_fields[FieldCleaner.cleanString(
                    schema.name_short)] == 0:
                # return long colname for error reporting
                missing_headers.append(schema.name)

        if len(missing_headers) > 0 or len(duplicated_headers) > 0:
            # Write header errors if any occurred and raise a header_error exception
            error_string = ""
            with self.get_writer(bucket_name, error_filename,
                                 self.header_report_headers,
                                 self.is_local) as writer:
                extra_info = {}
                if len(duplicated_headers) > 0:
                    error_string = "".join([
                        error_string, "Duplicated: ",
                        ", ".join(duplicated_headers)
                    ])
                    extra_info["duplicated_headers"] = ", ".join(
                        duplicated_headers)
                    for header in duplicated_headers:
                        writer.write(["Duplicated header", header])
                if len(missing_headers) > 0:
                    if len(duplicated_headers):
                        # Separate missing and duplicated headers if both are present
                        error_string += "| "
                    error_string = "".join([
                        error_string, "Missing: ", ", ".join(missing_headers)
                    ])
                    extra_info["missing_headers"] = ", ".join(missing_headers)
                    for header in missing_headers:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException(
                "Errors in header row: " + str(error_string),
                StatusCode.CLIENT_ERROR, ValueError,
                ValidationError.headerError, **extra_info)

        return long_headers
    def openFile(self, region, bucket, filename, csvSchema, bucketName,
                 errorFilename):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            bucket : the S3 Bucket
            filename: The file path for the CSV file in S3
            writer: An implementation of csvAbstractWriter to send header errors to
        Returns:
        """

        possibleFields = {}
        currentFields = {}
        for schema in csvSchema:
            possibleFields[FieldCleaner.cleanString(schema.name)] = 0

        self.filename = filename
        self.unprocessed = ''
        self.extraLine = False
        self.lines = []
        self.headerDictionary = {}
        self.packetCounter = 0
        current = 0
        self.isFinished = False
        self.columnCount = 0
        line = self._getLine()
        # make sure we have not finished reading the file

        if (self.isFinished):
            raise ResponseException("CSV file must have a header",
                                    StatusCode.CLIENT_ERROR, ValueError,
                                    ValidationError.singleRow)

        duplicatedHeaders = []
        #create the header
        for row in csv.reader([line], dialect='excel'):
            for cell in row:
                headerValue = FieldCleaner.cleanString(cell)
                if (headerValue not in possibleFields):
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.headerDictionary[(current)] = None
                    current += 1
                elif (possibleFields[headerValue] == 1):
                    # Add to duplicated header list
                    duplicatedHeaders.append(headerValue)
                else:
                    self.headerDictionary[(current)] = headerValue
                    possibleFields[headerValue] = 1
                    current += 1
        self.columnCount = current
        # Check that all required fields exist
        missingHeaders = []
        for schema in csvSchema:
            if (schema.required and possibleFields[FieldCleaner.cleanString(
                    schema.name)] == 0):
                missingHeaders.append(schema.name)
        if (len(missingHeaders) > 0 or len(duplicatedHeaders) > 0):
            # Write header errors if any occurred and raise a header_error exception

            with self.getWriter(bucketName, errorFilename,
                                self.headerReportHeaders,
                                self.isLocal) as writer:
                extraInfo = {}
                if (len(duplicatedHeaders) > 0):
                    extraInfo["duplicated_headers"] = ", ".join(
                        duplicatedHeaders)
                    for header in duplicatedHeaders:
                        writer.write(["Duplicated header", header])
                if (len(missingHeaders) > 0):
                    extraInfo["missing_headers"] = ", ".join(missingHeaders)
                    for header in missingHeaders:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException("Errors in header row",
                                    StatusCode.CLIENT_ERROR, ValueError,
                                    ValidationError.headerError, **extraInfo)
Example #33
    def createTable(self, fileType, filename, jobId, tableName=None):
        """ Create staging table for new file
        Args:
        fileType -- type of file to create a table for (e.g. Award, AwardFinancial)

        Returns:
        tableName if created, exception otherwise
        """
        if (tableName == None):
            tableName = self.interface.getTableName(jobId)
        self.name = tableName

        if (self.interface.tableExists(tableName)):
            # Old table still present, drop table and replace
            self.interface.dropTable(tableName)

        # Alternate way of naming tables
        #tableName = "data" + tableName.replace("/","").replace("\\","").replace(".","")
        # Write tableName to related job in job tracker

        self.interfaces.jobDb.addStagingTable(jobId, tableName)
        fields = self.interfaces.validationDb.getFieldsByFile(fileType)
        """ Might not need sequence for ORM
        # Create sequence to be used for primary key
        sequenceName = tableName + "Serial"
        sequenceStatement = "CREATE SEQUENCE " + sequenceName + " START 1"
        try:
            self.runStatement(sequenceStatement)
        except ProgrammingError:
            # Sequence already exists
            pass
        """
        primaryAssigned = False
        # Create empty dict for field names and values
        classFieldDict = {"__tablename__": tableName}
        # Add each column
        for key in fields:
            # Build column statement for this key
            # Create cleaned version of key
            newKey = str(fields[key].file_column_id)
            # Get correct type name
            fieldTypeName = FieldCleaner.cleanString(
                fields[key].field_type.name)
            if (fieldTypeName == "string"):
                fieldTypeName = Text
            elif (fieldTypeName == "int"):
                fieldTypeName = Integer
            elif (fieldTypeName == "decimal"):
                fieldTypeName = Numeric
            elif (fieldTypeName == "boolean"):
                fieldTypeName = Boolean
            elif (fieldTypeName == "long"):
                fieldTypeName = BigInteger
            else:
                raise ValueError("Bad field type")
            # Get extra parameters (primary key or not null)
            extraParam = ""
            if (FieldCleaner.cleanString(
                    fields[key].field_type.description) == "primary_key"):
                classFieldDict[newKey] = Column(fieldTypeName,
                                                primary_key=True)
                primaryAssigned = True
            elif (fields[key].required):
                classFieldDict[newKey] = Column(fieldTypeName, nullable=False)
            else:
                classFieldDict[newKey] = Column(fieldTypeName)

        if (not primaryAssigned):
            # If no primary key assigned, add one based on table name
            classFieldDict["".join([tableName,
                                    "id"])] = Column(Integer, primary_key=True)

        # Create ORM class based on dict
        self.orm = type(tableName, (declarative_base(), ), classFieldDict)
        self.jobId = jobId

        # Create table
        self.orm.__table__.create(self.interface.engine)
    def load_sql(cls, filename):
        """Load SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(RuleSql).delete()

            filename = os.path.join(cls.sql_rules_path, filename)

            # open csv
            with open(filename, 'rU') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    sql = cls.read_sql_str(row['query_name'])

                    rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                       rule_error_message=row['rule_error_message'], query_name=row['query_name'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["file_type"], row["rule_label"]))
                    try:
                        if row["target_file"].strip() == "":
                            # No target file provided
                            target_file_id = None
                        else:
                            target_file_id = FILE_TYPE_DICT[row["target_file"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["target_file"], row["rule_label"]))

                    # set cross file flag
                    flag = FieldCleaner.clean_string(row["rule_cross_file_flag"])
                    if flag in ('true', 't', 'y', 'yes'):
                        cross_file_flag = True
                    else:
                        cross_file_flag = False

                    rule_sql.rule_severity_id = RULE_SEVERITY_DICT[row['severity_name']]
                    rule_sql.file_id = file_id
                    rule_sql.target_file_id = target_file_id
                    rule_sql.rule_cross_file_flag = cross_file_flag

                    sess.merge(rule_sql)
            sess.commit()
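For reference, load_sql only ever reads the columns below from each CSV row (the full expected header set lives in cls.headers, which is not shown here). A hedged sketch of one parsed row as csv.DictReader would hand it to the loop, with invented placeholder values:

# One row as csv.DictReader would yield it to the loop above; every key below is read by load_sql,
# but the values are invented placeholders, not real broker rules.
example_row = {
    'rule_label': 'A1',
    'rule_error_message': 'TotalBudgetaryResources_CPE must equal the sum of its components.',
    'query_name': 'a1_appropriations',
    'file_type': 'appropriations',
    'target_file': '',              # blank means no target file, so target_file_id stays None
    'rule_cross_file_flag': 'false',
    'severity_name': 'fatal',
}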
    def loadCsv(cls, filename, model, interface, fieldMap, fieldOptions):
        """ Loads a table based on a csv

        Args:
            filename: CSV to load
            model: ORM object for table to be loaded
            interface: interface to DB table is in
            fieldMap: dict that maps columns of the csv to attributes of the ORM object
            fieldOptions: dict with keys of attribute names, value contains a dict with options for that attribute.
                Current options are "pad_to_length", which pads the field with leading zeros up to the
                specified length, and "skip_duplicates", which ignores subsequent lines that repeat values.
        """
        # Delete all records currently in table
        interface.session.query(model).delete()
        interface.session.commit()
        valuePresent = {}
        # Open csv
        with open(filename,'rU') as csvfile:
            # Read header
            header = csvfile.readline()
            # Split header into fieldnames
            rawFieldNames = header.split(",")
            fieldNames = []
            # Clean field names
            for field in rawFieldNames:
                fieldNames.append(FieldCleaner.cleanString(field))
            # Map fieldnames to attribute names
            attributeNames = []
            for field in fieldNames:
                if field in fieldMap:
                    attributeNames.append(fieldMap[field])
                    if fieldMap[field] in fieldOptions and "skip_duplicates" in fieldOptions[fieldMap[field]]:
                        # Create empty dict for this field
                        valuePresent[fieldMap[field]] = {}
                else:
                    raise KeyError("".join(["Found unexpected field ", str(field)]))
            # Check that all fields are present
            for field in fieldMap:
                if field not in fieldNames:
                    raise ValueError("".join([str(field), " is required for loading table ", str(model)]))
            # Open DictReader with attribute names
            reader = csv.DictReader(csvfile,fieldnames = attributeNames)
            # For each row, create instance of model and add it
            for row in reader:
                skipInsert = False
                for field in fieldOptions:
                    # For each field with options present, modify according to those options
                    options = fieldOptions[field]
                    if "pad_to_length" in options:
                        padLength = options["pad_to_length"]
                        row[field] = Validator.padToLength(row[field],padLength)
                    if "skip_duplicates" in options:
                        if len(row[field].strip()) == 0 or row[field] in valuePresent[field]:
                            # Value not provided or already exists, skip it
                            skipInsert = True
                        else:
                            # Insert new value
                            valuePresent[field][row[field]] = True
                # Create the model instance from the (possibly modified) row
                record = model(**row)
                if not skipInsert:
                    try:
                        interface.session.merge(record)
                    except IntegrityError as e:
                        # Hit a duplicate value that violates index, skip this one
                        print("".join(["Warning: Skipping this row: ",str(row)]))
                        print("".join(["Due to error: ",str(e)]))
                        interface.session.rollback()
                        continue
            interface.session.commit()
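The two fieldOptions behaviors are easy to see in isolation. A minimal sketch, assuming str.zfill stands in for Validator.padToLength and using a hypothetical cgac_code attribute; in loadCsv itself this would be configured as fieldOptions = {"cgac_code": {"pad_to_length": 3, "skip_duplicates": True}} together with a fieldMap from CSV column to attribute name.

# Standalone sketch of the two fieldOptions behaviors on plain dicts; zfill approximates
# Validator.padToLength, and the cgac_code attribute is an invented example.
rows = [{"cgac_code": "12"}, {"cgac_code": "012"}, {"cgac_code": "7"}]
seen = {}
loaded = []
for row in rows:
    row["cgac_code"] = row["cgac_code"].zfill(3)   # pad_to_length: 3
    if not row["cgac_code"].strip() or row["cgac_code"] in seen:
        continue                                   # skip_duplicates: blank or already-seen value
    seen[row["cgac_code"]] = True
    loaded.append(row)
print(loaded)  # [{'cgac_code': '012'}, {'cgac_code': '007'}]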
    def runValidation(self, jobId, interfaces):
        """ Run validations for specified job
        Args:
            jobId: Job to be validated
            interfaces: holder of the job tracker, error, validation, and staging database interfaces
        Returns:
            True if successful
        """
        jobTracker = interfaces.jobDb
        rowNumber = 1
        fileType = jobTracker.getFileType(jobId)
        # If local, make the error report directory
        if(self.isLocal and not os.path.exists(self.directory)):
            os.makedirs(self.directory)
        # Get bucket name and file name
        fileName = jobTracker.getFileName(jobId)
        self.filename = fileName
        bucketName = CONFIG_BROKER['aws_bucket']
        regionName = CONFIG_BROKER['aws_region']

        errorFileName = self.getFileName(jobTracker.getReportPath(jobId))

        # Create File Status object
        interfaces.errorDb.createFileIfNeeded(jobId,fileName)

        validationDB = interfaces.validationDb
        fieldList = validationDB.getFieldsByFileList(fileType)
        csvSchema  = validationDB.getFieldsByFile(fileType)
        rules = validationDB.getRulesByFile(fileType)

        reader = self.getReader()

        # Get file size and write to jobs table
        if(CONFIG_BROKER["use_aws"]):
            fileSize =  s3UrlHandler.getFileSize("errors/"+jobTracker.getReportPath(jobId))
        else:
            fileSize = os.path.getsize(jobTracker.getFileName(jobId))
        jobTracker.setFileSizeById(jobId, fileSize)


        try:
            # Pull file
            reader.openFile(regionName, bucketName, fileName,fieldList,bucketName,errorFileName)
            # Create staging table

            tableName = interfaces.stagingDb.getTableName(jobId)
            # Create staging table
            tableObject = StagingTable(interfaces)
            tableObject.createTable(fileType,fileName,jobId,tableName)
            errorInterface = interfaces.errorDb

            # While not done, pull one row and put it into staging if it passes
            # the Validator
            with self.getWriter(regionName, bucketName, errorFileName, self.reportHeaders) as writer:
                while(not reader.isFinished):
                    rowNumber += 1
                    #if (rowNumber % 1000) == 0:
                    #    print("Validating row " + str(rowNumber))
                    try :
                        record = FieldCleaner.cleanRow(reader.getNextRecord(), fileType, validationDB)
                        record["row"] = rowNumber
                        if(reader.isFinished and len(record) < 2):
                            # This is the last line and is empty, don't record an error
                            rowNumber -= 1 # Don't count this row
                            break
                    except ResponseException as e:
                        if reader.isFinished and reader.extraLine:
                            # The last line may be blank; reader.extraLine indicates the last valid line
                            # ended with extra line breaks, so don't record an error or count this row
                            rowNumber -= 1
                        else:
                            writer.write(["Formatting Error", ValidationError.readErrorMsg, str(rowNumber), ""])
                            errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.readError,rowNumber)
                            errorInterface.setRowErrorsPresent(jobId, True)
                        continue
                    valid, failures = Validator.validate(record,rules,csvSchema,fileType,interfaces)
                    if valid:
                        try:
                            tableObject.insert(record,fileType)
                        except ResponseException as e:
                            # Write failed, move to next record
                            writer.write(["Formatting Error", ValidationError.writeErrorMsg, str(rowNumber),""])
                            errorInterface.recordRowError(jobId,self.filename,"Formatting Error",ValidationError.writeError,rowNumber)
                            errorInterface.setRowErrorsPresent(jobId, True)
                            continue

                    else:
                        # For each failure, record it in error report and metadata
                        if failures:
                            errorInterface.setRowErrorsPresent(jobId, True)
                        for failure in failures:
                            fieldName = failure[0]
                            error = failure[1]
                            failedValue = failure[2]
                            try:
                                # If error is an int, it's one of our prestored messages
                                errorType = int(error)
                                errorMsg = ValidationError.getErrorMessage(errorType)
                            except ValueError:
                                # If not, treat it literally
                                errorMsg = error
                            writer.write([fieldName,errorMsg,str(rowNumber),failedValue])
                            errorInterface.recordRowError(jobId,self.filename,fieldName,error,rowNumber)
                # Write unfinished batch
                writer.finishBatch()

            # Write number of rows to job table
            jobTracker.setNumberOfRowsById(jobId,rowNumber)
            # Write leftover records
            tableObject.endBatch()
            # Mark validation as finished in job tracker
            jobTracker.markJobStatus(jobId,"finished")
            errorInterface.writeAllRowErrors(jobId)
        finally:
            #ensure the file always closes
            reader.close()
        return True
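The failure-handling branch treats numeric errors as prestored message codes and anything else as literal text. A standalone sketch of that branch; the message dict is a hypothetical stand-in for ValidationError.getErrorMessage, whose real contents are not shown here.

PRESTORED_MESSAGES = {1: "Value is required", 2: "Value is of the wrong type"}  # hypothetical codes

def resolve_error_message(error):
    try:
        return PRESTORED_MESSAGES[int(error)]
    except ValueError:
        return error  # not an int, treat the failure text literally

print(resolve_error_message("2"))                      # -> "Value is of the wrong type"
print(resolve_error_message("Field exceeds length"))   # -> "Field exceeds length"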
Example #39
    def rule_sum_fields(cls, data, value, rule, datatype, interfaces, record):
        """Checks that a set of fields sums to the value in another field"""
        valueToMatch = record[FieldCleaner.cleanName(rule.rule_text_1)]
        if valueToMatch is None or valueToMatch == "":
            valueToMatch = 0
        return cls.validateSum(valueToMatch, rule.rule_text_2, record)
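Validator.validateSum is not shown in this example. A hedged sketch of what such a sum check plausibly does, assuming rule_text_2 is a comma-separated list of field names (an assumption) and using invented field names:

from decimal import Decimal

def validate_sum(value_to_match, fields_to_sum, record):
    # Sum the listed fields and compare against the target value, treating blanks as zero
    total = sum(Decimal(record[field] or 0) for field in fields_to_sum.split(','))
    return total == Decimal(value_to_match or 0)

record = {"total_cpe": "100", "part_a": "40", "part_b": "60"}
print(validate_sum(record["total_cpe"], "part_a,part_b", record))  # True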
Example #40
    def load_sql(cls, filename):
        """Load SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(RuleSql).delete()

            filename = os.path.join(cls.sql_rules_path, filename)

            # open csv
            with open(filename, 'rU') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join([
                        "Found unexpected fields: ",
                        str(list(unknown_fields))
                    ]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join([
                        "Missing required fields: ",
                        str(list(missing_fields))
                    ]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    sql = cls.read_sql_str(row['query_name'])

                    rule_sql = RuleSql(
                        rule_sql=sql,
                        rule_label=row['rule_label'],
                        rule_error_message=row['rule_error_message'],
                        query_name=row['query_name'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception(
                            "{}: file type={}, rule label={}. Rule not loaded."
                            .format(e, row["file_type"], row["rule_label"]))
                    try:
                        if row["target_file"].strip() == "":
                            # No target file provided
                            target_file_id = None
                        else:
                            target_file_id = FILE_TYPE_DICT[row["target_file"]]
                    except Exception as e:
                        raise Exception(
                            "{}: file type={}, rule label={}. Rule not loaded."
                            .format(e, row["target_file"], row["rule_label"]))

                    # set cross file flag
                    flag = FieldCleaner.clean_string(
                        row["rule_cross_file_flag"])
                    if flag in ('true', 't', 'y', 'yes'):
                        cross_file_flag = True
                    else:
                        cross_file_flag = False

                    rule_sql.rule_severity_id = RULE_SEVERITY_DICT[
                        row['severity_name']]
                    rule_sql.file_id = file_id
                    rule_sql.target_file_id = target_file_id
                    rule_sql.rule_cross_file_flag = cross_file_flag

                    sess.merge(rule_sql)
            sess.commit()
    def process_data_chunk(self, sess, chunk_df):
        """ Loads in a chunk of the file and performs initial validations

            Args:
                sess: the database connection
                chunk_df: pandas dataframe containing the chunk of the file being validated
        """
        logger.info({
            'message': 'Loading rows starting from {}'.format(self.max_row_number + 1),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'data_loading',
            'status': 'start'
        })

        # initializing warning/error files and dataframes
        total_errors = pd.DataFrame(columns=self.report_headers)
        total_warnings = pd.DataFrame(columns=self.report_headers)
        flex_data = None
        required_list = {}
        type_list = {}
        office_list = {}

        # Replace whatever the user included so we're using the database headers
        chunk_df.rename(columns=self.reader.header_dict, inplace=True)

        empty_file = chunk_df.empty

        if not empty_file:
            chunk_df = chunk_df.applymap(clean_col)

            # Adding row number
            chunk_df = chunk_df.reset_index()
            # The index resets for each chunk, so account for the header row and previously processed rows
            chunk_df['row_number'] = chunk_df.index + 1 + self.max_row_number
            self.total_rows += len(chunk_df.index)

            # Increment row numbers if any were ignored being too long
            # This syncs the row numbers back to their original values
            for row in sorted(self.long_rows):
                chunk_df.loc[chunk_df['row_number'] >= row, 'row_number'] = chunk_df['row_number'] + 1

            # Setting max row number for chunking purposes
            self.max_row_number = chunk_df['row_number'].max()

            # Filtering out already processed long rows
            self.long_rows = [row for row in self.long_rows if row > self.max_row_number]

            # Drop rows that were too short and pandas filled in with Nones
            chunk_df = chunk_df[~chunk_df['row_number'].isin(self.short_rows)]

            # Drop the index column
            chunk_df = chunk_df.drop(['index'], axis=1)

            # Drop all rows that have 1 or less filled in values (row_number is always filled in so this is how
            # we have to drop all rows that are just empty)
            chunk_df.dropna(thresh=2, inplace=True)
            empty_file = chunk_df.empty

        if not empty_file:
            if self.is_fabs:
                # create a list of all required/type labels for FABS
                labels = sess.query(ValidationLabel).all()
                for label in labels:
                    if label.label_type == 'requirement':
                        required_list[label.column_name] = label.label
                    else:
                        type_list[label.column_name] = label.label

                # Create a list of all offices
                offices = sess.query(Office.office_code, Office.sub_tier_code).all()
                for office in offices:
                    office_list[office.office_code] = office.sub_tier_code
                # Free the raw office query results now that the lookup dict is built
                del offices

            # Gathering flex data (must be done before chunk limiting)
            if self.reader.flex_fields:
                flex_data = chunk_df.loc[:, list(self.reader.flex_fields + ['row_number'])]
            if flex_data is not None and not flex_data.empty:
                flex_data['concatted'] = flex_data.apply(lambda x: concat_flex(x), axis=1)

            # Dropping any extraneous fields included + flex data (must be done before file type checking)
            chunk_df = chunk_df[list(self.expected_headers + ['row_number'])]

            # Only do validations if it's not a D file
            if self.file_type.name not in ['award', 'award_procurement']:

                # Padding specific fields
                for field in self.parsed_fields['padded']:
                    chunk_df[field] = chunk_df.apply(
                        lambda x: FieldCleaner.pad_field(self.csv_schema[field], x[field]), axis=1)
                # Cleaning up numbers so they can be inserted properly
                for field in self.parsed_fields['number']:
                    chunk_df[field] = chunk_df.apply(lambda x: clean_numbers(x[field]), axis=1)

                if self.is_fabs:
                    chunk_df['is_valid'] = True
                    chunk_df['awarding_sub_tier_agency_c'] = chunk_df.apply(
                        lambda x: derive_fabs_awarding_sub_tier(x, office_list), axis=1)
                    chunk_df['afa_generated_unique'] = chunk_df.apply(
                        lambda x: derive_fabs_afa_generated_unique(x), axis=1)
                    chunk_df['unique_award_key'] = chunk_df.apply(
                        lambda x: derive_fabs_unique_award_key(x), axis=1)
                else:
                    chunk_df['tas'] = chunk_df.apply(lambda x: concat_tas_dict(x), axis=1)
                    chunk_df['display_tas'] = chunk_df.apply(lambda x: concat_display_tas_dict(x), axis=1)
                chunk_df['unique_id'] = chunk_df.apply(lambda x: derive_unique_id(x, self.is_fabs), axis=1)

                # Separate each of the checks to their own dataframes, then concat them together
                req_errors = check_required(chunk_df, self.parsed_fields['required'], required_list,
                                            self.report_headers, self.short_to_long_dict[self.file_type.file_type_id],
                                            flex_data, is_fabs=self.is_fabs)
                type_errors = check_type(chunk_df, self.parsed_fields['number'] + self.parsed_fields['boolean'],
                                         type_list, self.report_headers, self.csv_schema,
                                         self.short_to_long_dict[self.file_type.file_type_id], flex_data,
                                         is_fabs=self.is_fabs)
                type_error_rows = type_errors['Row Number'].tolist()
                length_errors = check_length(chunk_df, self.parsed_fields['length'], self.report_headers,
                                             self.csv_schema, self.short_to_long_dict[self.file_type.file_type_id],
                                             flex_data, type_error_rows)

                if self.is_fabs:
                    error_dfs = [req_errors, type_errors, length_errors]
                    warning_dfs = [pd.DataFrame(columns=list(self.report_headers + ['error_type']))]
                else:
                    error_dfs = [req_errors, type_errors]
                    warning_dfs = [length_errors]

                total_errors = pd.concat(error_dfs, ignore_index=True)
                total_warnings = pd.concat(warning_dfs, ignore_index=True)

                # Converting these to ints because pandas likes to change them to floats randomly
                total_errors[['Row Number', 'error_type']] = total_errors[['Row Number', 'error_type']].astype(int)
                total_warnings[['Row Number', 'error_type']] = total_warnings[['Row Number', 'error_type']]. \
                    astype(int)

                self.error_rows.extend([int(x) for x in total_errors['Row Number'].tolist()])

                for index, row in total_errors.iterrows():
                    self.error_list.record_row_error(self.job.job_id, self.file_name, row['Field Name'],
                                                     row['error_type'], row['Row Number'], row['Rule Label'],
                                                     self.file_type.file_type_id, None, RULE_SEVERITY_DICT['fatal'])

                for index, row in total_warnings.iterrows():
                    self.error_list.record_row_error(self.job.job_id, self.file_name, row['Field Name'],
                                                     row['error_type'], row['Row Number'], row['Rule Label'],
                                                     self.file_type.file_type_id, None, RULE_SEVERITY_DICT['warning'])

                total_errors.drop(['error_type'], axis=1, inplace=True, errors='ignore')
                total_warnings.drop(['error_type'], axis=1, inplace=True, errors='ignore')

                # Remove type error rows from original dataframe
                chunk_df = chunk_df[~chunk_df['row_number'].isin(type_error_rows)]
                chunk_df.drop(['unique_id'], axis=1, inplace=True)

        # Write all the errors/warnings to their files
        total_errors.to_csv(self.error_file_path, columns=self.report_headers, index=False, quoting=csv.QUOTE_ALL,
                            mode='a', header=False)
        total_warnings.to_csv(self.warning_file_path, columns=self.report_headers, index=False,
                              quoting=csv.QUOTE_ALL, mode='a', header=False)

        # Finally load the data into the database
        if not empty_file:
            # The model data
            now = datetime.now()
            chunk_df['created_at'] = now
            chunk_df['updated_at'] = now
            chunk_df['job_id'] = self.job.job_id
            chunk_df['submission_id'] = self.submission_id
            insert_dataframe(chunk_df, self.model.__table__.name, sess.connection())

            # Flex Fields
            if flex_data is not None:
                flex_data.drop(['concatted'], axis=1, inplace=True)
                flex_data = flex_data[flex_data['row_number'].isin(chunk_df['row_number'])]

                flex_rows = pd.melt(flex_data, id_vars=['row_number'], value_vars=self.reader.flex_fields,
                                    var_name='header', value_name='cell')

                # Filling in all the shared data for these flex fields
                now = datetime.now()
                flex_rows['created_at'] = now
                flex_rows['updated_at'] = now
                flex_rows['job_id'] = self.job.job_id
                flex_rows['submission_id'] = self.submission_id
                flex_rows['file_type_id'] = self.file_type.file_type_id

                # Adding the entire set of flex fields
                insert_dataframe(flex_rows, FlexField.__table__.name, sess.connection())
        sess.commit()

        logger.info({
            'message': 'Loaded rows up to {}'.format(self.max_row_number),
            'message_type': 'ValidatorInfo',
            'submission_id': self.submission_id,
            'job_id': self.job.job_id,
            'file_type': self.file_type.name,
            'action': 'data_loading',
            'status': 'end'
        })
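The flex-field reshaping is plain pandas: pd.melt turns the wide flex columns into one (row_number, header, cell) record per value, which is the shape inserted into FlexField. A self-contained illustration with invented data:

# Column names mirror the code above; the data itself is invented.
import pandas as pd

flex_data = pd.DataFrame({
    'row_number': [2, 3],
    'flex_field_a': ['x1', 'x2'],
    'flex_field_b': ['y1', None],
})
flex_rows = pd.melt(flex_data, id_vars=['row_number'], value_vars=['flex_field_a', 'flex_field_b'],
                    var_name='header', value_name='cell')
print(flex_rows)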
Example #42
    def openFile(self, region, bucket, filename, csvSchema, bucketName, errorFilename):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
            bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
            filename: The file path for the CSV file in S3
            csvSchema: list of FileColumn objects for this file type
            bucketName: bucket to send errors to
            errorFilename: filename for error report
        """


        possibleFields = {}
        for schema in csvSchema:
            possibleFields[FieldCleaner.cleanString(schema.name)] = 0

        self.filename = filename
        self.unprocessed = ''
        self.extraLine = False
        self.lines = []
        self.headerDictionary = {}
        self.packetCounter = 0
        current = 0
        self.isFinished = False
        self.columnCount = 0
        line = self._getLine()
        # make sure we have not finished reading the file

        if(self.isFinished) :
            # Write header error for no header row
            with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
                writer.write(["No header row"])
                writer.finishBatch()
            raise ResponseException("CSV file must have a header",StatusCode.CLIENT_ERROR,ValueError,ValidationError.singleRow)

        duplicatedHeaders = []
        #create the header

        # check delimiters in header row
        pipeCount = line.count("|")
        commaCount = line.count(",")

        if pipeCount != 0 and commaCount != 0:
            # Write header error for mixed delimiter use
            with self.getWriter(bucketName, errorFilename, ["Error Type"], self.isLocal) as writer:
                writer.write(["Cannot use both ',' and '|' as delimiters. Please choose one."])
                writer.finishBatch()
            raise ResponseException("Error in header row: CSV file must use only '|' or ',' as the delimiter", StatusCode.CLIENT_ERROR, ValueError, ValidationError.headerError)

        self.delimiter = "|" if line.count("|") != 0 else ","
        for row in csv.reader([line], dialect='excel', delimiter=self.delimiter):
            for cell in row:
                headerValue = FieldCleaner.cleanString(cell)
                if headerValue not in possibleFields:
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.headerDictionary[current] = None
                    current += 1
                elif possibleFields[headerValue] == 1:
                    # Add to duplicated header list
                    duplicatedHeaders.append(headerValue)
                else:
                    self.headerDictionary[current] = headerValue
                    possibleFields[headerValue] = 1
                    current += 1
        self.columnCount = current
        # Check that all required fields exist
        missingHeaders = []
        for schema in csvSchema:
            if possibleFields[FieldCleaner.cleanString(schema.name)] == 0:
                missingHeaders.append(schema.name)
        if(len(missingHeaders) > 0 or len(duplicatedHeaders) > 0):
            # Write header errors if any occurred and raise a header_error exception
            errorString = ""
            with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
                extraInfo = {}
                if(len(duplicatedHeaders) > 0):
                    errorString = "".join([errorString, "Duplicated: ",", ".join(duplicatedHeaders)])
                    extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                    for header in duplicatedHeaders:
                        writer.write(["Duplicated header", header])
                if(len(missingHeaders) > 0):
                    if(len(duplicatedHeaders)):
                        # Separate missing and duplicated headers if both are present
                        errorString += "| "
                    errorString = "".join([errorString, "Missing: ",", ".join(missingHeaders)])
                    extraInfo["missing_headers"] = ", ".join(missingHeaders)
                    for header in missingHeaders:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException("Errors in header row: " + str(errorString), StatusCode.CLIENT_ERROR, ValueError,ValidationError.headerError,**extraInfo)
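The delimiter handling above reduces to a small rule: a header mixing ',' and '|' is rejected, otherwise '|' wins whenever it appears. A standalone sketch with invented header lines:

def detect_delimiter(header_line):
    # Mixed delimiters are an error; otherwise prefer '|' if it is present at all
    if header_line.count("|") and header_line.count(","):
        raise ValueError("Cannot use both ',' and '|' as delimiters. Please choose one.")
    return "|" if header_line.count("|") else ","

print(detect_delimiter("agency_identifier|allocation_transfer_agency"))  # '|'
print(detect_delimiter("agency_identifier,allocation_transfer_agency"))  # ','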
    def createTable(self, fileType, filename, jobId, tableName=None):
        """ Create staging table for new file
        Args:
        fileType -- type of file to create a table for (e.g. Award, AwardFinancial)

        Returns:
        tableName if created, exception otherwise
        """
        if tableName is None:
            tableName = self.interface.getTableName(jobId)
        self.name = tableName

        if(self.interface.tableExists(tableName)):
            # Old table still present, drop table and replace
            self.interface.dropTable(tableName)

        # Alternate way of naming tables
        #tableName = "data" + tableName.replace("/","").replace("\\","").replace(".","")
        # Write tableName to related job in job tracker

        self.interfaces.jobDb.addStagingTable(jobId,tableName)
        fields = self.interfaces.validationDb.getFieldsByFile(fileType)

        """ Might not need sequence for ORM
        # Create sequence to be used for primary key
        sequenceName = tableName + "Serial"
        sequenceStatement = "CREATE SEQUENCE " + sequenceName + " START 1"
        try:
            self.runStatement(sequenceStatement)
        except ProgrammingError:
            # Sequence already exists
            pass
        """
        primaryAssigned = False
        # Create empty dict for field names and values
        classFieldDict = {"__tablename__":tableName}
        # Create dict to hold record for field names
        fieldNameMap = {}
        # Add each column
        for key in fields:
            # Build column statement for this key
            # Create cleaned version of key
            newKey = str(fields[key].file_column_id)
            # Get correct type name
            fieldTypeName = FieldCleaner.cleanString(fields[key].field_type.name)
            if(fieldTypeName == "string"):
                fieldTypeName = Text
            elif(fieldTypeName == "int"):
                fieldTypeName = Integer
            elif(fieldTypeName == "decimal"):
                fieldTypeName = Numeric
            elif(fieldTypeName == "boolean"):
                fieldTypeName = Boolean
            elif(fieldTypeName == "long"):
                fieldTypeName = BigInteger
            else:
                raise ValueError("Unknown field type: " + fieldTypeName)
            # Mark column as primary key or not-null based on the field definition
            if FieldCleaner.cleanString(fields[key].field_type.description) == "primary_key":
                classFieldDict[newKey] = Column(fieldTypeName, primary_key=True)
                primaryAssigned = True
            elif(fields[key].required):
                classFieldDict[newKey] = Column(fieldTypeName, nullable=False)
            else:
                classFieldDict[newKey] = Column(fieldTypeName)
            # First record will hold field names
            fieldNameMap[str(newKey)] = str(key)
        # Add column for row number
        classFieldDict["row"] = Column(Integer, nullable=False)

        if(not primaryAssigned):
            # If no primary key assigned, add one based on table name
            classFieldDict["".join([tableName,"id"])] = Column(Integer, primary_key = True)


        # Create ORM class based on dict
        self.orm = type(tableName,(declarative_base(),),classFieldDict)
        self.jobId = jobId

        # Create table
        self.orm.__table__.create(self.interface.engine)

        # Add field name map to table
        self.interface.addFieldNameMap(tableName,fieldNameMap)
    def openFile(self, region, bucket, filename, csvSchema, bucketName, errorFilename):
        """ Opens file and prepares to read each record, mapping entries to specified column names
        Args:
            region: AWS region where the bucket is located (not used if instantiated as CsvLocalReader)
            bucket: the S3 Bucket (not used if instantiated as CsvLocalReader)
            filename: The file path for the CSV file in S3
            csvSchema: list of FileColumn objects for this file type
            bucketName: bucket to send errors to
            errorFilename: filename for error report
        """


        possibleFields = {}
        for schema in csvSchema:
            possibleFields[FieldCleaner.cleanString(schema.name)] = 0

        self.filename = filename
        self.unprocessed = ''
        self.extraLine = False
        self.lines = []
        self.headerDictionary = {}
        self.packetCounter = 0
        current = 0
        self.isFinished = False
        self.columnCount = 0
        line = self._getLine()
        # make sure we have not finished reading the file

        if(self.isFinished) :
            raise ResponseException("CSV file must have a header",StatusCode.CLIENT_ERROR,ValueError,ValidationError.singleRow)

        duplicatedHeaders = []
        #create the header
        for row in csv.reader([line], dialect='excel'):
            for cell in row:
                headerValue = FieldCleaner.cleanString(cell)
                if headerValue not in possibleFields:
                    # Allow unexpected headers, just mark the header as None so we skip it when reading
                    self.headerDictionary[current] = None
                    current += 1
                elif possibleFields[headerValue] == 1:
                    # Add to duplicated header list
                    duplicatedHeaders.append(headerValue)
                else:
                    self.headerDictionary[current] = headerValue
                    possibleFields[headerValue] = 1
                    current += 1
        self.columnCount = current
        # Check that all required fields exist
        missingHeaders = []
        for schema in csvSchema:
            if schema.required and possibleFields[FieldCleaner.cleanString(schema.name)] == 0:
                missingHeaders.append(schema.name)
        if(len(missingHeaders) > 0 or len(duplicatedHeaders) > 0):
            # Write header errors if any occurred and raise a header_error exception

            with self.getWriter(bucketName, errorFilename, self.headerReportHeaders, self.isLocal) as writer:
                extraInfo = {}
                if(len(duplicatedHeaders) > 0):
                    extraInfo["duplicated_headers"] = ", ".join(duplicatedHeaders)
                    for header in duplicatedHeaders:
                        writer.write(["Duplicated header", header])
                if(len(missingHeaders) > 0):
                    extraInfo["missing_headers"] = ", ".join(missingHeaders)
                    for header in missingHeaders:
                        writer.write(["Missing header", header])
                writer.finishBatch()
            raise ResponseException("Errors in header row", StatusCode.CLIENT_ERROR, ValueError,ValidationError.headerError,**extraInfo)
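The header bookkeeping in both versions of openFile rests on the possibleFields flag dict: every expected header starts at 0, a second sighting marks a duplicate, and anything still 0 afterwards is missing. A minimal sketch with invented header lists:

# Expected and actual header names here are invented examples.
expected = ["availability_type_code", "agency_identifier", "tas"]
actual = ["tas", "agency_identifier", "tas", "extra_column"]

possible_fields = {name: 0 for name in expected}
duplicated, header_dictionary = [], {}
for index, header in enumerate(actual):
    if header not in possible_fields:
        header_dictionary[index] = None        # unexpected header, skipped when reading
    elif possible_fields[header] == 1:
        duplicated.append(header)              # seen twice
    else:
        header_dictionary[index] = header
        possible_fields[header] = 1
missing = [name for name, seen in possible_fields.items() if seen == 0]
print(duplicated, missing)  # ['tas'] ['availability_type_code']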