def check_number_of_rows(self, rows):
    """Abort or warn when the generated table would not fit on one sheet.

    Logs a critical message and terminates the program when ``rows`` is
    greater than 1,048,576 (``excel_max_rows``); logs a warning when
    ``rows`` is above 80 % of that limit. Otherwise nothing happens.

    Args:
        rows: Number of rows that are about to be generated.

    Raises:
        SystemExit: With status 1 when ``rows`` exceeds the limit.
    """
    # The bare ``exit`` name is injected by the ``site`` module for
    # interactive sessions and may be missing (``python -S``, frozen
    # builds); ``sys.exit`` is always available. A non-zero status also
    # lets a calling process see that the run failed.
    import sys

    excel_max_rows = 1048576
    if rows > excel_max_rows:
        self.log.critical(pc.get_error_msg('error', 'too_big'))
        sys.exit(1)
    if rows > 0.8 * excel_max_rows:
        self.log.warning(pc.get_error_msg('warning', 'near_limit'))
    def do_chi2_test(self, peczely_table, front_table):
        """Run a chi-square test on the Péczely and on the Front table.

        For every cell the expected count is
        ``row_total / days * column_total`` where ``days`` is the number of
        days between ``self.mindate`` and ``self.maxdate``. The two results
        of ``scipy.stats.chisquare`` are stored in ``self.p_values``
        (Péczely first, Front second). A warning is logged for a table when
        more than 30 % of its cells hold fewer than 5 observations.

        Args:
            peczely_table: DataFrame of observed counts (13 rows x 2 columns
                according to the original hard-coded sizes).
            front_table: DataFrame of observed counts (9 rows x 2 columns
                according to the original hard-coded sizes).
        """
        self.p_values = []
        days = (self.maxdate - self.mindate).days

        def chi2_for(table, label):
            # Sparse-table warning: at most 30 % of the cells may be below 5
            # (13*2 cells -> 7, 9*2 cells -> 5); integer maths avoids float
            # rounding in the limit.
            max_small_cells = table.size * 3 // 10
            if (table < 5).sum().sum() > max_small_cells:
                self.log.warning(
                    pc.get_error_msg('warning', 'under_limit').format(label))

            observed = table.values
            row_totals = table.sum(axis=1).values
            col_totals = table.sum(axis=0).values
            # Broadcasting builds the whole expected table as floats in one
            # step instead of writing floats cell by cell into an integer
            # DataFrame.
            # NOTE(review): ``days`` is used as the grand total; chisquare
            # expects the observed and expected sums to agree -- confirm
            # that the table totals equal this day count.
            expected = (row_totals[:, None] / days) * col_totals[None, :]

            # chisquare uses k - 1 - ddof degrees of freedom with k the
            # number of cells; ddof = rows + cols - 2 yields the
            # (rows - 1) * (cols - 1) of a contingency table
            # (13 for 13x2, 9 for 9x2).
            n_rows, n_cols = observed.shape
            return scs.chisquare(observed, expected, axis=None,
                                 ddof=n_rows + n_cols - 2)

        self.p_values.append(chi2_for(peczely_table, 'Péczely'))
        self.p_values.append(chi2_for(front_table, 'Front'))
    def __init__(self, filename, excel=True, sheet_name=0):
        """Load the input data and reset the analysis state.

        Args:
            filename: Path of the input file.
            excel: When true, ``filename`` is parsed as an Excel workbook
                into ``self.df``. Otherwise it is opened as UTF-8 text
                (``self.ifile``) and ``peczely_hozzarendelt.tmp`` is opened
                for writing (``self.ofile``).
            sheet_name: Sheet handed to ``ExcelFile.parse``; only used when
                ``excel`` is true.
        """
        self.log = logging.getLogger(__name__)
        self.log.debug("DataManipulator created")

        self.df = pd.DataFrame()
        self.excel = excel
        if excel:
            try:
                # The context manager closes the workbook even when parsing
                # the sheet raises.
                with pd.ExcelFile(filename) as xlsx_file:
                    self.df = xlsx_file.parse(sheet_name, header=0)
            except ImportError:
                # Best effort kept from the original: the missing Excel
                # engine is logged and the object continues with an empty
                # frame.
                self.log.critical(
                    pc.get_error_msg('error', 'xlrd not imported'))

            # Every column except the first one is kept as a column name.
            self.column_names = self.df.columns.values.tolist()[1:]
        else:
            # Both handles stay open as attributes; they are not closed in
            # this method.
            self.ifile = open(filename, "r", encoding='utf_8')
            try:
                self.ofile = open("peczely_hozzarendelt.tmp",
                                  "w",
                                  encoding='utf_8')
            except OSError:
                # Do not leak the input handle when the output file cannot
                # be created.
                self.ifile.close()
                raise
            self.column_names = []
        # Start from the extreme Timestamps so that any real date replaces
        # them in a later min/max search.
        self.maxdate = pd.Timestamp.min  # =    1677-09-21
        self.mindate = pd.Timestamp.max  # =    2262-04-11
        self.pivot_generator = pd.DataFrame()
        self.number_of_clusters = 0
        self.size_of_dimensions = []
        self.dims_can_be_checked = False
        self.dates = pd.DataFrame()
        self.p_values = []
Пример #4
0
 def check_errors(self):
     """Validate the configured file names and probe write access.

     Steps, in order:
       * the output file must end in ``.xlsx``; otherwise the program exits
         with status -1 when ``self.exit_on_warning`` is set, or the
         extension is appended and the rename is logged;
       * a warning is logged when ``self.input_sheet_name`` is 0;
       * a small probe workbook is written to the output path; a
         ``PermissionError`` is logged and ends the program with status -2;
       * a warning is logged when the log file or the chi file does not end
         in ``.txt``.

     Raises:
         SystemExit: Status -1 (bad extension) or -2 (no write permission).
     """
     # ``exit`` comes from the ``site`` module and is meant for interactive
     # use; ``sys.exit`` is always available.
     import sys

     if not self.output_file.endswith('.xlsx'):
         if self.exit_on_warning:
             self.log.critical(pc.get_error_msg('error', 'extension_error') + self.output_file)
             sys.exit(-1)
         else:
             self.output_file += '.xlsx'
             self.log.error(pc.get_error_msg('warning', 'file_renamed') + self.output_file)
     if self.input_sheet_name == 0:
         self.log.warning(pc.get_error_msg('warning', 'no_sheet_name'))
     try:
         probe = datamanipulator.pd.DataFrame(
             data='Testing of write permission is completed, access granted',
             index=range(2),
             columns=range(2))
         # Leaving the ``with`` block saves and closes the workbook. The
         # separate ``save()`` call is gone because ``ExcelWriter.save`` no
         # longer exists in pandas 2.x, and the writer is now released even
         # when writing fails.
         with datamanipulator.pd.ExcelWriter(self.output_directory + '/' + self.output_file) as writer:
             probe.to_excel(writer, index=False, sheet_name='testing IO error')
     except PermissionError as pe:
         self.log.critical(pe)
         self.log.critical(pc.get_error_msg('error', 'permission_error'))
         sys.exit(-2)
     if not self.logfile_name.endswith('.txt'):
         self.log.warning(pc.get_error_msg('warning', 'bad_logfile_extension'))
     if not self.chi_filename.endswith('.txt'):
         self.log.warning(pc.get_error_msg('warning', 'bad_chi_file_extension'))
     self.log.debug('First round error checks are completed (UI.check_errors)')
 def check_dims(self):
     """Tell whether one dimension size is stored per cluster.

     Compares ``len(self.size_of_dimensions)`` with
     ``self.number_of_clusters``. On a mismatch the column labels of
     ``self.df`` are written to the debug log and a critical message with
     both numbers is logged.

     Returns:
         True when the two numbers agree, False otherwise.
     """
     found = len(self.size_of_dimensions)
     if found == self.number_of_clusters:
         return True

     # Logging the labels may fail on consoles that cannot encode them;
     # fall back to a plain notice in that case.
     try:
         self.log.debug(self.df.columns.values)
     except UnicodeEncodeError:
         self.log.debug('failed to print self.df.columns.values')
     template = pc.get_error_msg('error', 'dimension_mismatch')
     self.log.critical(template.format(self.number_of_clusters, found))
     return False
    def create_generator_table_old(self):
        """Build ``self.pivot_generator`` from the Peczely and Front tables.

        The number of rows is estimated first and passed to
        ``check_number_of_rows``. Then both generator tables are created
        and concatenated (Peczely first, Front second) into
        ``self.pivot_generator``.
        """
        # error check: (13 + 9) * 5 rows for every combination of cluster
        # values, i.e. times the product of the first
        # ``number_of_clusters`` dimension sizes
        clus = 1
        for i in range(self.number_of_clusters):
            clus *= self.size_of_dimensions[i]
        rows = (13 + 9) * 5 * clus
        # check_number_of_rows already logs the 'near_limit' warning above
        # 80 % of the sheet limit, so the extra warning for more than
        # 1,000,000 rows only repeated the same message and was dropped.
        self.check_number_of_rows(rows)

        # generate
        peczely_table = self.create_generator_table('Peczely', 13, 2,
                                                    count_peczely)
        front_table = self.create_generator_table('Front', 9, 2, count_front)
        self.pivot_generator = pd.concat([peczely_table, front_table],
                                         copy=False)
    def assign_numbers(self):
        """Attach the meteorological codes to every row of ``self.df``.

        The first column of ``self.df`` is read as the date of the row. For
        each date ``PFAssign.return_date`` supplies the values of the
        columns ``P-2`` .. ``P2`` and ``F-2`` .. ``F2``; these are joined to
        ``self.df``. While iterating, ``self.mindate`` and ``self.maxdate``
        are updated, and afterwards ``check_ascending`` is called.

        ``self.number_of_clusters`` is set to the number of columns minus
        one. ``check_dims`` is skipped on the first call and enforced on
        every later one.

        Raises:
            SystemExit: With status 1 when ``check_dims`` fails or a date
                cannot be converted (``TypeError``).
        """
        # ``exit`` comes from the ``site`` module and is meant for
        # interactive use; ``sys.exit`` is always available.
        import sys

        self.log.debug("assigning meteorological numbers to dates started")
        self.number_of_clusters = self.df.shape[1] - 1
        # error check
        if self.dims_can_be_checked:
            if not self.check_dims():
                sys.exit(1)
        else:
            self.dims_can_be_checked = True

        assigner = PFAssign()
        meteo_df = pd.DataFrame(columns=[
            'P-2', 'P-1', 'P0', 'P1', 'P2', 'F-2', 'F-1', 'F0', 'F1', 'F2'
        ])
        for row_number, (_, row) in enumerate(self.df.iterrows()):
            # ``row`` is indexed by the column labels, so the first cell has
            # to be taken by position; ``row[0]`` is a deprecated positional
            # fallback that newer pandas treats as a label lookup.
            date = row.iloc[0]  # it's a Timestamp format
            try:
                meteo_df.loc[row_number] = assigner.return_date(
                    round(to_excel_date(date)))
            except TypeError:
                self.log.critical(pc.get_error_msg('error', 'format'))
                sys.exit(1)

            # min max
            if date > self.maxdate:
                self.maxdate = date
            if date < self.mindate:
                self.mindate = date

        self.check_ascending()
        self.df = self.df.join(meteo_df)
        self.log.info("assigning meteorological numbers to dates ended")
        self.log.debug('min date: %s', self.mindate)
        self.log.debug('max date: %s', self.maxdate)
 def check_ascending(self):
     """Abort unless the first column starts at the minimum and ends at the maximum.

     Compares ``self.mindate`` with the first cell and ``self.maxdate``
     with the last cell of the first column of ``self.df``. When either
     differs, a critical message is logged and the program terminates.

     Raises:
         SystemExit: With status 1 when one of the comparisons fails.
     """
     # ``exit`` comes from the ``site`` module and is meant for interactive
     # use; ``sys.exit`` is always available. A non-zero status also lets a
     # calling process see that the run failed.
     import sys

     # NOTE(review): only the two end points are compared; rows in between
     # are not checked for order -- confirm that this is sufficient.
     first_date = self.df.iloc[0, 0]
     last_date = self.df.iloc[-1, 0]
     if self.mindate != first_date or self.maxdate != last_date:
         self.log.critical(pc.get_error_msg('error', 'not_sorted'))
         sys.exit(1)