Example No. 1
 def from_disc(cls,
               csvpath,
               dict_schema,
               schema_type='qc',
               id_column=1,
               threshold=3):
     """
     Constucts a TableReport from a csvfile and a given schema.
     Arguments:
     :param csvpath: string, the csv filepath
     :param schema: dictionary describing the csv schema
     :param schema_type: 'qc' for frictionless type, 'dc' for Data Catalogue type json schema
     :param id_column: column number of dataset's primary key (id)
     :param threshold: outlier threshold - (mean - threshold * std, mean + threshold * std) 
                       outside this length, a numerical value is considered outlier
     """
     if schema_type == 'qc':
         dataset_schema = QcSchema(dict_schema)
     elif schema_type == 'dc':
         LOGGER.info(
             'Translating from Data Catalogue to Frictionless json format...'
         )
         qcdict_schema = FrictionlessFromDC(dict_schema).qcdescriptor
         dataset_schema = QcSchema(qcdict_schema)
     else:
         # guard against unsupported schema_type values
         raise ValueError("schema_type must be 'qc' or 'dc'")
     dataset = QcTable(csvpath, schema=dataset_schema)
     return cls(dataset, id_column=id_column, threshold=threshold)
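A minimal usage sketch of the constructor above; the file paths are illustrative and the schema is assumed to live in a frictionless-style JSON file:

    import json

    with open('dataset_schema.json') as f:  # hypothetical schema file
        dict_schema = json.load(f)

    report = TableReport.from_disc('dataset.csv',
                                   dict_schema=dict_schema,
                                   schema_type='qc',  # or 'dc' for Data Catalogue JSON
                                   threshold=3)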
Example No. 2
 def reorganizefiles(self, output):
     """Reorganize the DICOM (.dcm) files into the folder structure
     expected by the LORIS import pipeline.
     Arguments:
     :param output: output folder
     """
     LOGGER.info('Reorganizing files for LORIS pipeline into folder: %s' %
                 output)
     for patient in self.patients:
         patientid = patient.patientid
         patdir = os.path.join(output, patientid)
         if not os.path.exists(patdir):
             os.mkdir(patdir)
         for study_count, study in enumerate(patient.studies, start=1):
             studydir = os.path.join(patdir,
                                     '_'.join([patientid, str(study_count)]))
             if not os.path.exists(studydir):
                 os.mkdir(studydir)
             for seq in study.sequences:
                 for dicom in seq.dicoms:
                     sourcepath = dicom.filepath
                     destpath = os.path.join(studydir, dicom.filename)
                     shutil.copy(sourcepath, destpath)
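For a hypothetical patient PAT001 with two studies, the resulting layout under output would be:

    # output/
    # └── PAT001/
    #     ├── PAT001_1/
    #     │   └── <dicoms of study 1>
    #     └── PAT001_2/
    #         └── <dicoms of study 2>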
Example No. 3
 def suggest_cde(self, columnreport, threshold=0.6):
     """Suggests the most similar CDE for the column.
     Arguments:
     :param columnreport: ColumnReport object with info of a dataset column
     :param threshold: 0-1 similarity threshold; below it, no CDE is suggested
     :returns: a CdeVariable object, or None if no match exceeds the threshold
     """
     name = columnreport.name
     val_range = columnreport.value_range
     mip_type = columnreport.miptype
     LOGGER.debug('The incoming column name is: {}'.format(name))
     # select cdes with the same type and calculate similarity
     candidates = [
         cde for cde in self.__cdes.values() if cde.miptype == mip_type
     ]
     LOGGER.debug('Number of cdes with miptype {} is: {}'.format(
         mip_type, len(candidates)))
     if candidates:
         candidates.sort(key=lambda x: x.similarity(name, val_range),
                         reverse=True)
         candidate = candidates[0]
         similarity = candidate.similarity(name, val_range)
         LOGGER.debug(
             'The similarity between "{}" and cde "{}" is: {}'.format(
                 name, candidate.code, similarity))
         if similarity >= threshold:
             return candidate
     LOGGER.info(
         'No cde match found for incoming column "{}"'.format(name))
     return None
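A hedged usage sketch; the mapper instance and the ColumnReport are assumed to come from earlier steps of the QC run:

    cde = mapper.suggest_cde(columnreport, threshold=0.6)
    if cde is not None:
        print('Suggested CDE:', cde.code)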
Example No. 4
    def __init__(self, rootfolder, username):
        """Arguments:
        :param rootfolder: folder path with DICOM subfolders
        :param username: str with the username
        """
        start_time = time.time()
        self.reportdata = None
        self.rootfolder = rootfolder
        self.subfolders = getsubfolders(rootfolder)
        self.username = username
        self.dataset = {
            'version': __version__,
            'date_qc_ran': datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
            'username': username,
            'dicomfolder': str(os.path.abspath(rootfolder))
        }
        # files that are not DICOM (.dcm) files
        # list of (folder, filename) tuples
        self.__notprocessed = []
        # MRISequences objects
        self.__invalidseq = []
        # MRIPatient objects
        self.__patients = []

        # statistic info
        self.__totalvalidseq = 0
        self.__totalstudies = 0
        self.__totalinvaliddicoms = 0
        self.__seriesdescriptions_valid = set()
        self.__seriesdescriptions_invalid = set()
        self.__seqperpatient_distr = {
            's1': 0,
            's2': 0,
            's3-s5': 0,
            's6more': 0,
        }
        self.__patientid_with_invalids = set()

        # Read all the DICOM files and compute the QC stats
        self.__readicoms_parallel(mp.cpu_count() + 1)
        self.__collect_stats()

        LOGGER.debug('Dicom analysis running time: %s seconds' %
                     (time.time() - start_time))
        LOGGER.debug('Folders read: %i' % len(self.subfolders))
        LOGGER.debug('Total MRI sequences found: %i' %
                     (self.totalvalidsequences + self.totalinvalidsequences))
        LOGGER.info('Patients with good seq: %i' % self.totalpatients)
        LOGGER.info('Total visits: %i' % self.totalvisits)
        LOGGER.info('Good seq: %i' % self.totalvalidsequences)
        LOGGER.info('Bad seq: %i' % self.totalinvalidsequences)
        LOGGER.info('Files not processed: %i' % self.totalbadfiles)
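A construction sketch; the class name (MRIDatasetReport here) is an assumption, since the snippet only shows the __init__ body:

    # Hypothetical usage; the class name is an assumption.
    report = MRIDatasetReport('/data/dicoms', username='analyst')
    # after construction the QC statistics are already computed and logged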
Example No. 5
    def save_schema(self):
        self.save_button.config(state='disabled')

        if self.schema_output.get() == 1:
            output_file = tkfiledialog.asksaveasfilename(
                title='enter file name',
                filetypes=(('excel files', '*.xlsx'), ('all files', '*.*')))
        else:
            output_file = tkfiledialog.asksaveasfilename(
                title='enter file name',
                filetypes=(('json files', '*.json'), ('all files', '*.*')))

        if output_file:
            warningtitle = 'Can not save the schema'
            if not self.dname:
                tkmessagebox.showwarning(warningtitle,
                                         'Please, select dataset file')
                self.save_button.config(state='normal')
                return
            max_categories = int(self.inf_opt_frame.max_categories.get())
            sample_rows = int(self.inf_opt_frame.sample_rows.get())
            na_empty_strings_only = self.inf_opt_frame.na_empty_strings_only.get()
            if self.inf_opt_frame.cde_dict:
                infer = InferSchema.from_disc(self.datasetpath, 
                                              sample_rows=sample_rows,
                                              maxlevels=max_categories,
                                              cdedict=self.inf_opt_frame.cde_dict,
                                              na_empty_strings_only=na_empty_strings_only)
                if self.inf_opt_frame.thresholdstring.get() == '':
                    threshold = 0.6
                else:
                    threshold = float(self.inf_opt_frame.thresholdstring.get())
                LOGGER.info('CDE similarity threshold: %f' % threshold)
                infer.suggest_cdes(threshold=threshold)
                infer.export2excel(output_file)
                LOGGER.info('Schema file has been created successfully')
                tkmessagebox.showinfo(
                    title='Status info',
                    message='Schema file has been created successfully')

            else:
                infer = InferSchema.from_disc(self.datasetpath, 
                                              sample_rows=sample_rows,
                                              maxlevels=max_categories,
                                              cdedict=None,
                                              na_empty_strings_only=na_empty_strings_only)
                if self.schema_output.get() == 1:
                    infer.export2excel(output_file)
                    LOGGER.info('Schema file has been created successfully')
                    tkmessagebox.showinfo(
                        title='Status info',
                        message='Schema file has been created successfully')
                else:
                    infer.export2qcjson(output_file)
                    LOGGER.info('Schema file has been created successfully')
                    tkmessagebox.showinfo(
                        title='Status info',
                        message='Schema file has been created successfully')
        self.save_button.config(state='normal')
Example No. 6
 def add_replacement_expr(self):
     reps = self.func_replace_trg_listbox.get(0, tk.END)
     replacements = []
     source_col = self.parent.csv_name.replace(
         ".csv", "") + '.' + self.selected_column
     LOGGER.debug('the replacements are: {}'.format(reps))
     if len(reps) > 0:
         self.expressions_text.delete("1.0", "end-1c")
         for rep in reps:
             s = rep.split('->')
             replacements.append(Replacement(s[0], s[1]))
         expr = ifstr(source_col, replacements)
         LOGGER.info('the expression is: {}'.format(expr))
         self.expressions_text.insert('1.0', expr)
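Each listbox entry is expected in the old->new form; splitting on '->' yields the Replacement pairs handed to ifstr(). An illustrative recoding:

    # Listbox entries (illustrative):
    #   M->male
    #   F->female
    # produce Replacement('M', 'male') and Replacement('F', 'female'),
    # which ifstr() turns into a conditional mapping expression on the column.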
Example No. 7
 def suggest_corrs(self):
     self.suggest_btn.config(state='disabled')
     warningtitle = 'Could not make suggestions'
     if self.infer_opt_frame.cde_dict:
         if self.infer_opt_frame.thresholdstring.get() == '':
             threshold = 0.6
         else:
             threshold = float(self.infer_opt_frame.thresholdstring.get())

         self.cdemapper.suggest_corr(self.infer_opt_frame.cde_dict,
                                     threshold=threshold)
         LOGGER.info('Done with the correspondence suggestions... '
                     'Updating listbox...')
         self.update_listbox_corr()
     else:
         tkmessagebox.showwarning(warningtitle,
                                  'Could not find the CDE dictionary file')
     self.suggest_btn.config(state='normal')
Example No. 8
 def __readicoms_parallel(self, processes):
     """Read all the DICOMs using multiprocessing."""
     output = []
     if len(self.subfolders) > processes:
         LOGGER.info('DICOM parallel processing with {} processes'.format(
             processes))
         slices = list(splitdict(self.subfolders, processes))
         with Pool(processes) as p:
             output = p.map(self.readicoms_chunks, slices)
         for chunk in output:
             self.__patients += chunk['patients']
             self.__invalidseq += chunk['invalidseq']
             self.__notprocessed += chunk['notprocessed']
     else:
         LOGGER.info('Single core processing...')
         output = self.readicoms_chunks(self.subfolders)
         self.__patients += output['patients']
         self.__invalidseq += output['invalidseq']
         self.__notprocessed += output['notprocessed']
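splitdict is assumed to split the subfolders dict into roughly equal chunks, one per worker process; a minimal sketch of such a helper under that assumption:

    def splitdict(d, n):
        """Yield up to n roughly equal sub-dicts of d (sketch, assumption)."""
        items = list(d.items())
        k, m = divmod(len(items), n)
        for i in range(n):
            start = i * k + min(i, m)
            end = (i + 1) * k + min(i + 1, m)
            if start < end:
                yield dict(items[start:end])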
Example No. 9
    def __create_db_container(self):
        """Creates a postgres 9.6 container.
        """
        self.__check_db_container(mode='running')
        self.__check_db_container(mode='exist')

        if self.__is_db_running:
            LOGGER.info('db container ({}) is already up and'
                        ' running. Skipping creation step...'.format(
                            self.__db_cont_name))
            self.__remove_create_db()
        elif self.__is_db_exist and not self.__is_db_running:
            LOGGER.info('db container({}) already exists. '
                        'Restarting db container'.format(self.__db_cont_name))
            subprocess.run(['docker', 'restart', self.__db_cont_name])
            time.sleep(10)
            self.__remove_create_db()

        else:
            # create the db container
            LOGGER.debug('Creating db container with name {}'.format(
                self.__db_cont_name))
            arg_port = ['-p', '{}:5432'.format(self.__dbport)]
            arg_name = ['--name', self.__db_cont_name]
            arg_env1 = ['-e', 'POSTGRES_PASSWORD={}'.format(self.__dbpassword)]
            arg_env2 = ['-e', 'POSTGRES_USER={}'.format(self.__dbuser)]
            arg_img = ['-d', self.__db_image]
            command2 = (['docker', 'run'] + arg_port + arg_name
                        + arg_env1 + arg_env2 + arg_img)
            try:
                # check=True makes a non-zero exit raise CalledProcessError
                subprocess.run(command2, check=True)
                time.sleep(50)
                self.__remove_create_db()
            except subprocess.CalledProcessError:
                LOGGER.warning(
                    'There was an error while attempting to create the db container.'
                )
                raise DockerExecError(
                    'There was an error while attempting to create the db container.'
                )
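For reference, the assembled command is equivalent to the following shell invocation (placeholders stand for the instance attributes):

    # docker run -p <dbport>:5432 --name <db_cont_name> \
    #     -e POSTGRES_PASSWORD=<dbpassword> -e POSTGRES_USER=<dbuser> \
    #     -d <db_image>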
Example No. 10
 def get_all_cdes(self):
     LOGGER.info(
         'Trying to retrieve cde metadata from the Data Catalogue. Using DC url: {}'
         .format(DC_DOMAIN))
     all_pathologies_url = ''.join([DC_DOMAIN, DC_SUBDOMAIN_ALLPATHOLOGIES])
     r = requests.get(all_pathologies_url)
     self.dc = DcConnector(r)
     if self.dc.status_code == 200:
         self.dc_combox1.config(values=self.dc.pathology_names)
     elif 500 <= self.dc.status_code <= 599:
         LOGGER.info('Data Catalogue server internal error.')
     elif 400 <= self.dc.status_code <= 499:
         LOGGER.info(
             'Data Catalogue could not be reached! Please check DC_DOMAIN in the config.'
         )
Example No. 11
def csv(input_csv, schema_json, clean, metadata, report, outlier):
    """This command produces a validation report for <csv file>.

    The report file is stored in the same folder where <csv file> is located.
    
    <schema json> file MUST be compliant with frirctionless
      data table-schema specs(https://specs.frictionlessdata.io/table-schema/) or
      with Data Catalogue json format.
    """
    filename = os.path.basename(input_csv)
    # Get the path of the csv file
    path = os.path.dirname(os.path.abspath(input_csv))

    dataset_name = os.path.splitext(filename)[0]
    pdfreportfile = os.path.join(path, dataset_name + '_report.pdf')
    xlsxreportfile = os.path.join(path, dataset_name + '_report.xlsx')
    correctedcsvfile = os.path.join(path, dataset_name + '_corrected.csv')

    # read the json file with the csv schema
    with open(schema_json) as json_file:
        dict_schema = json.load(json_file)

    # check metadata json type is Data Catalogue specs
    if metadata == 'dc':
        LOGGER.info(
            'Translating from Data Catalogue to Frictionless json format...')
        dict_schema = FrictionlessFromDC(dict_schema).qcdescriptor

    schema = QcSchema(dict_schema)
    dataset = QcTable(input_csv, schema=schema)

    datasetreport = TableReport(dataset, threshold=outlier)

    # Apply data cleaning corrections?
    if clean:
        datasetreport.apply_corrections()
        datasetreport.save_corrected(correctedcsvfile)

    if datasetreport.isvalid:
        LOGGER.info('The dataset is valid.')
    else:
        LOGGER.info('CAUTION! The dataset is invalid!')

    # export the report
    if report == 'pdf':
        datasetreport.printpdf(pdfreportfile)
    elif report == 'xls':
        datasetreport.printexcel(xlsxreportfile)
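A hypothetical command-line invocation, assuming the function is exposed as a csv subcommand (the executable name and option spellings are assumptions):

    # qctool csv data.csv schema.json --clean --metadata dc --report pdf --outlier 3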
Example No. 12
 def extractSColumnsFunctions(expr, ddlFunctions, ddlColumns):
     try:
         # If it fails it will raise an exception
         CorrespondenceParser.firstCheckParentheses(expr)
     except SyntaxError as se:
         LOGGER.info(str(se))
         raise ExpressionError(str(se))
     try:
         columns = CorrespondenceParser.extractColumnsList(expr, ddlColumns)
     except ColumnNameError as cne:
         LOGGER.info(str(cne))
         raise ExpressionError(str(cne))
     try:
         CorrespondenceParser.extractSColumnsFunctionsR(
             expr, ddlFunctions, expr)
     except FunctionNameError as fne:
         LOGGER.info(str(fne))
         raise ExpressionError(str(fne))
     except ArgsFunctionError as afe:
         LOGGER.info(str(afe))
         raise ExpressionError(str(afe))
     return columns
Example No. 13
    def __run_mapping(self):
        env = {
            'mipmap_map': self.__mapping,
            'mipmap_source': self.__source,
            'mipmap_output': self.__output,
            'mipmap_pgproperties': self.__dbprop,
            'mipmap_script': self.__scriptpath,
            'mipmap_target': self.__target,
            'mipmap_db': MIPMAP_DB_CONTAINER,
        }
        self.__template.stream(env).dump(self.__dcompose)
        if self.__is_mipmap_container_exist:
            LOGGER.info('Removing previous mipmap container...')
            subprocess.run(['docker', 'rm', self.__name])

        arguments = ['docker-compose', '-f', self.__dcompose, 'up', 'mipmap_etl']

        process = subprocess.Popen(arguments, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        LOGGER.info('Running docker-compose...')
        output, _ = process.communicate()
        LOGGER.info(output.decode('utf-8'))
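self.__template is assumed to be a jinja2 Template; stream(env).dump(path) renders the compose file with the env values substituted. A minimal sketch under that assumption (the template fragment is hypothetical):

    from jinja2 import Template

    template = Template('services:\n'
                        '  mipmap_etl:\n'
                        '    environment:\n'
                        '      - MIPMAP_MAP={{ mipmap_map }}\n')
    template.stream({'mipmap_map': '/maps/mapping.xml'}).dump('docker-compose.yml')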
Example No. 14
    def infer(self, rows, headers=1,
              confidence=0.75, maxlevels=10,
              na_empty_strings_only=False):
        # Get headers
        if isinstance(headers, int):
            headers_row = headers
            while True:
                headers_row -= 1
                headers = rows.pop(0)
                if not headers_row:
                    break
        elif not isinstance(headers, list):
            headers = []

        # Get descriptor
        guesser = _QcTypeGuesser()
        resolver = _QcTypeResolver()
        descriptor = {'fields': []}
        type_matches = {}
        unique_values = {}
        missingvalues = set()
        for header in headers:
            descriptor['fields'].append({'name': header})
        LOGGER.info('{} of sample rows are used for table schema inference'.format(len(rows)))
        for row_number, row in enumerate(rows):
            # Normalize rows with invalid dimensions for sanity
            row_length = len(row)
            headers_length = len(headers)
            if row_length > headers_length:
                row = row[:len(headers)]
            if row_length < headers_length:
                diff = headers_length - row_length
                fill = [''] * diff
                row = row + fill
            # build a column-wise lookup of type matches
            for index, value in enumerate(row):
                # remove leading and trailing whitespace
                value = value.strip()
                rv = guesser.infer(value, na_empty_strings_only=na_empty_strings_only)
                name = rv[0]
                pattern = rv[1]
                # collect unique values for possible nominal variable
                if pattern == 'text' or name == 'integer':
                    unique_values.setdefault(index, set()).add(value)
                # collect the nans
                elif pattern == 'nan':
                    missingvalues.add(value)
                type_matches.setdefault(index, []).append(rv)
        # choose a type/format for each column based on the matches
        for index, results in type_matches.items():
            uniques = unique_values.get(index)
            rv = resolver.get(results, uniques, maxlevels, confidence)
            descriptor['fields'][index].update(**rv)
        # missing values have been found; include them in the descriptor
        if len(missingvalues) > 0:
            # add the default missing value in any case
            missingvalues.update(set(config.DEFAULT_MISSING_VALUES))
            # sort missing values
            missing_sorted = list(missingvalues)
            missing_sorted.sort()
            # update the missing values
            descriptor['missingValues'] = list(missing_sorted)
        # no missing values found; use the defaults
        else:
            descriptor['missingValues'] = config.DEFAULT_MISSING_VALUES

        # Save descriptor
        self._Schema__current_descriptor = descriptor
        self.__build()
        self.__infered = True

        return descriptor
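A minimal usage sketch (rows as lists of strings, one header row; the schema instance name is illustrative):

    rows = [['id', 'age', 'gender'],
            ['1', '34', 'M'],
            ['2', '29', 'F']]
    descriptor = schema.infer(rows, headers=1, confidence=0.75, maxlevels=10)
    # descriptor['fields'] now holds one inferred field spec per column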
Example No. 15
 def on_select_version(self, event):
     LOGGER.info('Retrieving metadata json')
     if self.dc and self.dc.status_code == 200:
         self.dc_json = self.dc.getjson(self.selected_pathology.get(),
                                        self.selected_version.get())
Example No. 16
    def createreport(self):
        self.button_exec.config(state='disabled')
        LOGGER.info('Checking if the necessary fields are filled in...')
        warningtitle = 'Cannot create report'
        if not self.dname:
            tkmessagebox.showwarning(warningtitle,
                                     'Please, select dataset file')
        #elif not self.d_headers_cbox.get():
        #    tkmessagebox.showwarning(warningtitle,
        #                             'Please, select ColumnID')
        elif self.md_frame.from_disk.get() and not self.md_frame.metafilepath:
            tkmessagebox.showwarning(warningtitle,
                                     'Please, select metadata file')
        elif self.md_frame.from_dc.get() and not self.md_frame.dc_json:
            tkmessagebox.showwarning(warningtitle,
                                     'Could not get metadata from the Data Catalogue')
        elif not self.__reportfilepath:
            tkmessagebox.showwarning(warningtitle,
                                     'Please, select report file first')
        else:
            try:
                threshold = float(self.outlier_threshold.get())
                LOGGER.info('Outlier threshold: %s' % self.outlier_threshold.get())
            except ValueError:
                LOGGER.warning('Could not retrieve outlier threshold. '
                               'Setting it to default value: 3')
                threshold = 3
            LOGGER.info('Everything looks ok...')
            #filedir = self.__exportfiledir
            #basename = os.path.splitext(self.dname)[0]
            #pdfreportfile = os.path.join(filedir, basename + '_report.pdf')
            #xlsxreportfile = os.path.join(filedir, basename + '_report.xlsx')
            schema_type = 'qc'

            if self.md_frame.from_disk.get():
                LOGGER.info('Retrieving Metadata from localdisk...')
                LOGGER.info('Using metadata file: %s' % self.md_frame.metafilepath)
                with open(self.md_frame.metafilepath) as json_file:
                    dict_schema = json.load(json_file)
                if self.md_frame.json_type.get() == 2:
                    schema_type = 'dc'

            elif self.md_frame.from_dc.get():
                LOGGER.info('Retrieving Metadata from Data Catalogue...')
                LOGGER.info('Selected pathology is {}, CDE version: {}'.format(
                    self.md_frame.selected_pathology.get(),
                    self.md_frame.selected_version.get()))
                dict_schema = self.md_frame.dc_json
                schema_type = 'dc'

            try:
                self.reportcsv = TableReport.from_disc(self.datasetpath,
                                                       dict_schema=dict_schema,
                                                       schema_type=schema_type,
                                                       threshold=threshold)
                # id_column=self.d_headers_cbox.current()
                if self.reportcsv.isvalid:
                    LOGGER.info('The dataset is valid.')
                else:
                    LOGGER.info('CAUTION! The dataset is invalid!')

                # Perform Data Cleaning?
                #if self.cleaning.get():
                 #   self.reportcsv.apply_corrections()

                    #self.reportcsv.save_corrected(correctedcsvfile)

                # Create the  report
                if self.report_type.get() == 1:
                    self.reportcsv.printexcel(self.__reportfilepath)
                else:
                    self.reportcsv.printpdf(self.__reportfilepath)

                #self.label_export2.config(text=filedir)
                tkmessagebox.showinfo(
                    title='Status info',
                    message='Reports have been created successfully'
                )

                self.show_sugg_button.config(state='normal')
                self.clean_button.config(state='normal')

            except QCToolException as e:
                errortitle = 'Something went wrong!'
                tkmessagebox.showerror(errortitle, str(e))
        self.button_exec.config(state='normal')
Example No. 17
 def __init__(self, dcjson):
     # generates the tree structure of the loaded DC json
     LOGGER.info('Finding variable tree...')
     self.rootnode = Node(dcjson)
     self.__dc2qc()