Example #1
    def _parse_value_with_corresponding_parser_(self, value, col):
        # resolve a column-specific parser named "parse_<col>", falling
        # back to default_parser when none is defined
        col_parser_name = 'parse_' + str(col)
        man_log.debug('parsing %s from %s using %s' %
                      (col, value, col_parser_name))
        col_parser = getattr(self, col_parser_name, self.default_parser)
        return col_parser(value, col)
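The getattr-with-default dispatch above means a subclass adds per-column parsing just by defining a method with the right name. A minimal, self-contained sketch of the convention (the Manager and InterviewManager classes here are hypothetical stand-ins, not part of the project):

# hypothetical stand-in for the manager classes in these examples
class Manager:
    def default_parser(self, value, col):
        return str(value)

    def _parse_value_with_corresponding_parser_(self, value, col):
        parser = getattr(self, 'parse_' + str(col), self.default_parser)
        return parser(value, col)


class InterviewManager(Manager):
    # picked up automatically for the 'interview_date' column
    def parse_interview_date(self, value, col):
        from datetime import datetime
        return datetime.strptime(value, '%m/%d/%Y').date()


m = InterviewManager()
print(m._parse_value_with_corresponding_parser_('03/05/2021', 'interview_date'))  # 2021-03-05
print(m._parse_value_with_corresponding_parser_('S001', 'subjectid'))             # S001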
Example #2
    def interview_date_write_formatter(self, dateobj, coldef):
        if isinstance(dateobj, self.NoDataError):
            return coldef.missing_vals
        if isinstance(dateobj, str):
            man_log.debug("date formatter catches a data string")
            return dateobj

        return dateobj.strftime('%m/%d/%Y')
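The '%m/%d/%Y' pattern writes US-style, zero-padded dates; a quick standard-library check:

from datetime import date

# '%m/%d/%Y' zero-pads both month and day
print(date(2021, 3, 5).strftime('%m/%d/%Y'))  # 03/05/2021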
Example #3
    def ensure_row(self, datarow):
        # raise DropRowException if any required column is missing its value
        man_log.debug("ENSURING DATA ROW %s" % datarow)
        for coldef, elem in datarow.items():
            if coldef.required:
                man_log.debug('row[%s](%s) is required' % (coldef, elem))
                if isinstance(elem, self.NoDataError):
                    man_log.critical("\n\n\nRAISING DROPROW")
                    raise DropRowException('%s' % elem)
Example #4
    def get_filepath(self,
                     save=False,
                     title='open file',
                     filetype='file',
                     quit=True,
                     allownew=True,
                     **kwargs):
        """this is a generic function that can be extended
            it simply gets a filepath and asserts it's not empty.
            if it's empty the program quits unless quit is False.
            when it will throw an error

            filetype is a string used for error messages and variable names

            askopenfilename takes other kwargs as well you can look into
            all of them provided get passed on.
            - defaultextension - str expression for default extensions
            - others check out utils.askopenfilename docs for more
            - initialdir - str path to where you would like to open
            TODO: figure out how to disallow new files being made/ allow
        """
        fpath = None
        # I have hard coded the file types to csv and tsv.
        if save:
            fpath = utils.asksaveasfilename(title=title,
                                            filetypes=(("csv files", "*.csv"),
                                                       ("all files", "*.*")),
                                            **kwargs)
        else:
            fpath = utils.askopenfilename(title=title,
                                          filetypes=(
                                              ("all files", "*.*"),
                                              ("tsv files", "*.tsv"),
                                              ("csv files", "*.csv"),
                                          ),
                                          **kwargs)

        # Check path validity
        if not fpath:
            print('no %s file selected. quitting' % filetype)
            utils.exit()

        setattr(self, filetype, fpath)
        man_log.debug('selected %s to be %s.' % (filetype, fpath))
        return fpath
Example #5
    def get_filepath(self,
                     save=False,  # True for an output (save) path, False for input
                     title='open file',
                     filetype='file',
                     quit=True,
                     allownew=True,
                     **kwargs) -> str:
        """this is a generic function that can be extended
            it simply gets a filepath and asserts it's not empty.
            if it's empty the program quits unless quit is False.
            when it will throw an error

            filetype is a string used for error messages and variable names

            askopenfilename takes other kwargs as well you can look into
            all of them provided get passed on.
            - defaultextension - str expression for default extensions
            - others check out utils.askopenfilename docs for more
            - initialdir - str path to where you would like to open
            - initialfile - str default filename

            TODO: figure out how to disallow new files being made/ allow
        """
        fpath = None
        # I have hard coded the file types to csv and tsv.
        if save:
            fpath = utils.asksaveasfilename(title=title,
                                            filetypes=(("csv files", "*.csv"),
                                                       ("all files", "*.*")),
                                            **kwargs)
        else:
            fpath = utils.askopenfilename(title=title,
                                          filetypes=(("all files", "*.*"),
                                                     ("tsv files", "*.tsv"),
                                                     ("csv files", "*.csv")),
                                          **kwargs)

        # Check path validity
        if not fpath:
            print('no %s file selected. quitting' % filetype)
            utils.exit()

        setattr(self, filetype, fpath)
        man_log.debug('selected %s to be %s.' % (filetype, fpath))
        return fpath
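utils.asksaveasfilename and utils.askopenfilename presumably wrap the standard tkinter dialogs (an assumption; the utils module isn't shown in these examples). The underlying stdlib call, for reference:

from tkinter import Tk
from tkinter.filedialog import askopenfilename

root = Tk()
root.withdraw()  # hide the empty root window; we only want the dialog
fpath = askopenfilename(title='open file',
                        filetypes=(("all files", "*.*"),
                                   ("csv files", "*.csv")))
print(fpath or 'no file selected')  # empty string/tuple when cancelled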
Example #6
    def load_data(self, clear_src=True):
        ''' loads the source data file and stores it in self.data
            so that it can be iterated through easily

            NOTE: if the datafile lives on a server over a VPN, this runs
            really slowly. we should pull the file, save the lines to memory
            with readlines, and close the file to speed things up, rather
            than querying the server every time.
        '''
        man_log.info('Loading source data into %s' % type(self).__name__)
        # if we want to clear the src
        if clear_src:
            self.initialize_data()

        # get source data in memory and validate data
        data = self._read_data_from_source_()
        if data is not None:
            self.data = data
        else:
            man_log.debug('Data source corrupted.')
            raise Exception("Data source corrupted. Please check the data source")
Example #7
    def _value_is_in_missing_list_(self, value, col_def):
        """
        Check whether the value is in the missing value of the col_def, if col_def has missing value list
        :param value: data value
        :param col_def:
        :return:
        """

        if hasattr(col_def, 'missing_vals'):
            man_log.debug('checking if %s in missing vals: %s' % (value,
                                                                  col_def.missing_vals))

            missing = [missing_val.strip() for missing_val in col_def.missing_vals.split(",")]

            return value in missing

        man_log.debug("column: %s doesn't have missing vals" % col_def)
        return False
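The missing_vals convention is just a comma-separated string on the column definition; a self-contained illustration (the "999, -1, NA" list is made up for the example):

MISSING_VALS = "999, -1, NA"  # hypothetical template value

missing = [v.strip() for v in MISSING_VALS.split(",")]
print("999" in missing)  # True
print("42" in missing)   # False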
Example #8
    def write_outfile(self):
        ''' writes self.data to the outfile, which the user provides '''

        outpath, outfile = self.read_output_file()
        self.write_header(outfile)

        outwriter = utils.DictWriter(outfile,
                                     fieldnames=self.col_defs,
                                     delimiter=self.delimiter)
        outwriter.writeheader()
        for rowid, row in enumerate(self.data):
            for coldef, elem in row.items():
                # NoDataError placeholders are handled by the write formatters
                formatter = getattr(self, coldef + '_write_formatter',
                                    self.default_write_formatter)

                man_log.debug('trying formatter %s' %
                              (coldef + '_write_formatter'))
                man_log.debug('formatting row[%s][%s](%s) with %s' %
                              (rowid, coldef, row[coldef], formatter.__name__))

                row[coldef] = formatter(row[coldef], coldef)
                man_log.debug('writing row[%s][%s] is %s' %
                              (rowid, coldef, row[coldef]))
            outwriter.writerow(row)

        return outpath
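utils.DictWriter presumably re-exports csv.DictWriter (an assumption). The underlying writeheader/writerow calls work like this:

import csv

# minimal csv.DictWriter round-trip; fieldnames fixes the column order
with open('out.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['subjectid', 'interview_date'],
                            delimiter=',')
    writer.writeheader()
    writer.writerow({'subjectid': 'S001', 'interview_date': '03/05/2021'})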
Example #9
    def _read_data_from_source_(self):
        '''
            The implementation of reading data from a local file. It uses a
            dialog to get the data path.

            Overridable: to read data from a different source, override this
            function.

        :return: a list of ordered dictionaries containing the data.
        '''
        data = []
        # open file
        srcfile = open(self.get_src_datpath(), errors='ignore')
        srcreader = utils.DictReader(srcfile, delimiter=self.delimiter)

        # assert the file has all the expected fields
        man_log.debug('expected fieldnames: %s' % self.col_defs)

        error_flag = False
        for index, col_name in enumerate(self.col_defs):
            if col_name not in srcreader.fieldnames:
                user_error_log.log_mapping_error(
                    col_name,
                    column_id=index + 1,
                    message="this field missing in data file")
                error_flag = True

        if error_flag:
            raise self.TemplateError(
                ('expected columns not '
                 'found in source datafile, with fields: %s') %
                (list(srcreader.fieldnames)))

        # load each row with each col's parser
        for rowid, datarow in enumerate(srcreader):
            man_log.info('loading row %s' % rowid)
            man_log.debug('parsing row %s : %s' % (rowid, datarow))

            row = utils.OrderedDict()
            for col in self.col_defs:
                try:
                    # the parser name is defined as "parse_" + col.col_name
                    # e.g., for source_col "subjectID", the parser will be "parse_subjectID"
                    row[col] = self._parse_value_with_corresponding_parser_(
                        datarow[col], col)

                except Exception as e:
                    man_log.debug('Exception while parsing %s: %s' % (col, e))
                    row[col] = self.NoDataError('%s' % e)

            data.append(row)

        return data
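Assuming utils.DictReader is csv.DictReader, the header check works because the reader exposes fieldnames from the file's first line before any rows are consumed:

import csv
import io

# in-memory stand-in for the source file
src = io.StringIO('subjectid\tinterview_date\nS001\t03/05/2021\n')
reader = csv.DictReader(src, delimiter='\t')

expected = ['subjectid', 'interview_date', 'twin']
missing = [c for c in expected if c not in reader.fieldnames]
print('missing columns:', missing)  # ['twin']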
Example #10
    def default_parser(self, value, coldef):
        ''' a simple fallback parser used when no column-specific parser
            is defined (also used when parsing the template?) '''
        man_log.debug('parsing [%s] from (%s)' % (coldef, value))

        # the empty string is also regarded as missing
        if value == "" or self._value_is_in_missing_list_(value, coldef):
            # if the value is in missing_vals, a NoDataError is returned
            # as the placeholder
            man_log.debug('replacing row[%s](%s) with NoData' % (coldef, value))
            return self.NoDataError(('value %s identified as a missing '
                                     'value for col %s') % (value, coldef))

        # 999 problem?
        man_log.debug('parse result is (%s)' % value)
        return str(value)
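Across these examples, NoDataError instances are stored in cells rather than raised, so downstream code can isinstance-check them. A minimal sketch of that sentinel pattern (NoDataError's real definition isn't shown; a plain Exception subclass is assumed):

class NoDataError(Exception):
    """placeholder stored in a cell when a value is missing or unparseable"""


row = {'interview_date': NoDataError('value 999 identified as missing')}

for col, elem in row.items():
    if isinstance(elem, NoDataError):
        print('%s is missing: %s' % (col, elem))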
Example #11
    def write_sink_data_with_outwriter(self, data, outwriter: utils.DictWriter) -> None:
        for rowid, row in enumerate(data):
            for coldef, elem in row.items():

                # get the formatter (mainly for date output); fall back to
                # default_write_formatter when no column-specific one exists
                formatter = getattr(self, coldef + '_write_formatter',
                                    self.default_write_formatter)

                man_log.debug('trying formatter %s' %
                              (coldef + '_write_formatter'))
                man_log.debug('formatting row[%s][%s](%s) with %s' %
                              (rowid, coldef, row[coldef], formatter.__name__))

                row[coldef] = formatter(row[coldef], coldef)
                man_log.debug('writing row[%s][%s] is %s' % (rowid, coldef,
                                                             row[coldef]))
            outwriter.writerow(row)

Example #12
    def _read_data_from_source_(self):
        """
            The implementation of reading data from a local file. It uses a dialog to get data path

            Overridable: if anyone wants to get data from different source, override this function

        :return: An array of ordered dictionary that contains the data.
        """
        data = []
        # open file
        srcfile = open(self.get_src_datpath(), errors='ignore')
        srcreader = utils.DictReader(srcfile, delimiter=self.delimiter)

        # assert the file has all the expected fields
        man_log.debug('expected fieldnames: %s' % self.col_defs)

        # this will throw TemplateError
        self._check_whether_all_src_cols_in_src_fields_(
            src_cols=self.col_defs, fieldnames=list(srcreader.fieldnames))

        # load each row with each col's parser
        for rowid, datarow in enumerate(srcreader):
            man_log.info('loading row %s' % rowid)
            man_log.debug('parsing row %s : %s' % (rowid, datarow))

            row = utils.OrderedDict()
            for col in self.col_defs:
                try:
                    # the parser name is defined as "parse_" + col.col_name
                    # e.g., for source_col "subjectID", the parser will be "parse_subjectID"
                    row[col] = self._parse_value_with_corresponding_parser_(datarow[col], col)

                except Exception as e:
                    man_log.debug('Exception while parsing %s: %s' % (col, e))
                    row[col] = self.NoDataError('%s' % e)

            data.append(row)

        return data
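_check_whether_all_src_cols_in_src_fields_ isn't shown in these examples; based on the inline version in Example #9, it presumably does something like:

    def _check_whether_all_src_cols_in_src_fields_(self, src_cols, fieldnames):
        # raise TemplateError when expected columns are absent from the header
        missing = [col for col in src_cols if col not in fieldnames]
        if missing:
            raise self.TemplateError(
                ('expected columns %s not '
                 'found in source datafile, with fields: %s') %
                (missing, list(fieldnames)))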
Example #13
    def _read_data_from_source_(self):
        '''
        Follows the read-data API; pulls rows from the wtp_data ODBC source.
        :return: a list of ordered dictionaries containing the data.
        '''
        data = []

        con = pyodbc.connect("DSN=wtp_data")

        # determine the primary key: it can be familyid plus twin, or just familyid.
        table_type = self.check_table_type(self.data_table[0], con)
        join_cmd = self.get_join_stmt(self.data_table, table_type)
        cursor = con.cursor()
        cursor.execute(join_cmd)
        desc = cursor.description
        fieldnames = self._get_fieldnames_(desc)

        # assert the data source has all the source fields defined in the template
        # so that no col_defs will map to nothing in the data source
        man_log.debug('expected fieldnames: %s' % self.col_defs)
        for col_name in self.col_defs:
            if col_name not in fieldnames:
                raise self.TemplateError(
                    ('expected column %s not '
                     'found in source datafile, with fields: %s') %
                    (col_name, list(fieldnames)))

        sql_data = cursor.fetchall()
        # load each row
        for rowid, datarow in enumerate(sql_data):
            man_log.info('loading row %s' % rowid)
            man_log.debug('parsing row %s : %s' % (rowid, datarow))
            row = utils.OrderedDict()
            for col in self.col_defs:
                try:
                    # find the column's position; values in datarow can
                    # only be accessed by index
                    col_name = col.col_name
                    index = fieldnames.index(col_name)

                    # prepare parser
                    col_parser_name = 'parse_' + str(col)
                    man_log.debug('parsing %s from %s using %s' %
                                  (col, datarow[index], col_parser_name))
                    col_parser = getattr(self, col_parser_name,
                                         self.default_parser)

                    # empty items in the db come back as None in Python,
                    # so normalize None to ""
                    if str(datarow[index]) == "None":
                        datarow[index] = ""

                    # everything is parsed from its string representation
                    row[col] = col_parser(str(datarow[index]), col)

                except Exception as e:
                    man_log.debug('Exception while parsing %s: %s' % (col, e))
                    row[col] = self.NoDataError('%s' % e)
            data.append(row)
        con.close()
        return data
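_get_fieldnames_ isn't shown either; per the DB-API, cursor.description is a sequence of 7-item tuples whose first element is the column name, so it is presumably along the lines of:

    def _get_fieldnames_(self, desc):
        # DB-API cursor.description entries are
        # (name, type_code, display_size, internal_size, precision, scale, null_ok)
        return [column[0] for column in desc]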
Example #14
    def _read_data_from_source_(self) -> List[utils.OrderedDict]:
        """
            Follow the api for read data.
            This overrided method will connect to tables in wtp_data based on the data table specified in
            the rocket template, and then convert the data records into a list of orderedDict, as the data source
            in source manager.
        :return:
        """
        data = []

        con = pyodbc.connect("DSN=wtp_data")

        # determine the primary key: it can be familyid plus twin, or just familyid.
        table_type = self.check_table_type(self.data_table[0], con) # type: TableType
        # table_type only matters with the join statement
        join_cmd = self.get_join_stmt(self.data_table, table_type)

        cursor = con.cursor()
        cursor.execute(join_cmd)
        desc = cursor.description
        fieldnames = self._get_fieldnames_(desc)

        # assert the data source has all the source fields defined in the template
        # so that no col_defs will map to nothing in the data source
        # man_log.debug('expected fieldnames: %s' % self.col_defs)
        # this will throw exception
        self._check_whether_all_src_cols_in_src_fields_(self.col_defs, fieldnames)

        sql_data = cursor.fetchall()
        # load each row
        for rowid, datarow in enumerate(sql_data):
            man_log.info('loading row %s' % rowid)
            man_log.debug('parsing row %s : %s' % (rowid, datarow))
            row = utils.OrderedDict()
            for col in self.col_defs:
                try:
                    # find the column's position; values in datarow can
                    # only be accessed by index
                    col_name = col.col_name
                    index = fieldnames.index(col_name)

                    # prepare parser
                    col_parser_name = 'parse_' + str(col)
                    man_log.debug('parsing %s from %s using %s' %
                                  (col, datarow[index], col_parser_name))
                    col_parser = getattr(self, col_parser_name,
                                         self.default_parser)

                    # empty items in the db come back as None in Python,
                    # so normalize None to ""
                    if str(datarow[index]) == "None":
                        datarow[index] = ""

                    # everything is parsed from its string representation
                    row[col] = col_parser(str(datarow[index]), col)

                except Exception as e:
                    man_log.debug('Exception while parsing %s: %s' % (col, e))
                    row[col] = self.NoDataError('%s' % e)
            data.append(row)
        con.close()
        return data