Python read_investigation_file 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: isatools.isatab

메소드/함수: read_investigation_file

hotexamples.com에서의 예제들: 3

Python read_investigation_file - 3개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 isatools.isatab.read_investigation_file의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: utils.py 프로젝트: enanomapper/isa-api

def assert_tab_content_equal(fp_x, fp_y):
    """
    Test for equality of tab files, only down to level of content - should not be taken as canonical equality, but
    rather that all the expected content is present in both input files, regardless of the order in which it appears.

    Investigation files (file name starting with ``i_``) are parsed with ``read_investigation_file`` and compared
    section by section. Any other file is read as a tab-separated table: all-empty columns are dropped, the
    ``Unit`` / ``Term Source REF`` / ``Term Accession Number`` columns that follow a ``Characteristics[...]``,
    ``Parameter Value[...]`` or ``Factor Value[...]`` column are relabelled with that column's name, and every
    column is then compared as a sorted sequence of values.

    For more precise equality, you will need to apply a configuration
        - use assert_tab_equal_by_config(fp_x, fp_y, config)
    :param fp_x: Open file object of an ISAtab file; its ``name`` selects how both files are parsed
    :param fp_y: Open file object of another ISAtab file
    :return: True if the contents match, False otherwise (the differences found are printed)
    """
    import re
    from os.path import basename

    import numpy as np

    # Columns whose trailing Unit / Term Source REF / Term Accession Number columns need context
    context_regexes = (
        re.compile(r"Characteristics\[(.*?)\]"),
        re.compile(r"Parameter Value\[(.*?)\]"),
        re.compile(r"Factor Value\[(.*?)\]"),
    )

    def _sort_by_first_column(frame):
        # a frame without columns has nothing to sort on
        if len(frame.columns) == 0:
            return frame
        return frame.sort_values(by=frame.columns[0])

    def _assert_df_equal(x, y, report=True):
        # need to sort values to loosen up how equality is calculated
        # NOTE(review): the sorted frames keep their original index labels and assert_frame_equal also compares
        # the index, so frames whose rows are ordered differently may still be reported as different - confirm
        try:
            assert_frame_equal(_sort_by_first_column(x), _sort_by_first_column(y))
            return True
        except AssertionError as e:
            if report:
                print(e)
            return False

    def _frame_lists_match(frames_x, frames_y):
        # Pair every table of x with a distinct equal table of y. DataFrames cannot be ordered with sorted()
        # (comparing two frames does not yield a single boolean), so the tables are matched instead.
        if len(frames_x) != len(frames_y):
            print("x has " + str(len(frames_x)) + " tables where y has " + str(len(frames_y)))
            return False
        # with exactly one candidate the detailed frame diff is meaningful, so let it be printed
        verbose = len(frames_x) == 1
        unmatched = list(frames_y)
        for frame_x in frames_x:
            for position, frame_y in enumerate(unmatched):
                if _assert_df_equal(frame_x, frame_y, report=verbose):
                    del unmatched[position]
                    break
            else:
                print("No table in y matches:")
                print(frame_x)
                return False
        return True

    def _drop_empty_columns(frame):
        # blank strings count as missing values when deciding whether a column is empty
        frame = frame.replace("", np.nan)
        frame = frame.dropna(axis=1, how="all")
        return frame.replace(np.nan, "")

    def _add_column_context(columns):
        # Give the generically named columns that qualify a Characteristics / Parameter Value / Factor Value
        # column a label that includes the column they belong to.
        original = list(columns)
        relabelled = list(original)
        for i, col in enumerate(original):
            if not any(regex.match(col) for regex in context_regexes):
                continue
            try:
                if "Unit" in original[i + 1]:
                    relabelled[i + 1] = col + "/Unit"
                    if "Term Source REF" in original[i + 2]:
                        relabelled[i + 2] = col + "/Unit/Term Source REF"
                    if "Term Accession Number" in original[i + 3]:
                        relabelled[i + 3] = col + "/Unit/Term Accession Number"
                elif "Term Source REF" in original[i + 1]:
                    relabelled[i + 1] = col + "/Term Source REF"
                    if "Term Accession Number" in original[i + 2]:
                        relabelled[i + 2] = col + "/Term Accession Number"
            except IndexError:
                # the column sits at the end of the table: no further qualifiers to relabel
                pass
        return relabelled

    def diff(a, b):
        b = set(b)
        return [aa for aa in a if aa not in b]

    if basename(fp_x.name).startswith("i_"):
        df_dict_x = read_investigation_file(fp_x)
        df_dict_y = read_investigation_file(fp_y)
        for k, dfx in df_dict_x.items():
            if k not in df_dict_y:
                print("Section " + str(k) + " is missing from y")
                return False
            dfy = df_dict_y[k]
            if isinstance(dfx, list):
                if not isinstance(dfy, list) or not _frame_lists_match(dfx, dfy):
                    return False
            elif isinstance(dfy, list) or not _assert_df_equal(dfx, dfy):
                return False
        return True

    df_x = pd.read_csv(fp_x, sep="\t", encoding="utf-8")
    df_y = pd.read_csv(fp_y, sep="\t", encoding="utf-8")
    try:
        # drop empty columns
        df_x = _drop_empty_columns(df_x)
        df_y = _drop_empty_columns(df_y)

        # pandas appends ".N" to repeated headers, so compare the names without that suffix first
        is_cols_equal = {x.split(".", 1)[0] for x in df_x.columns} == {x.split(".", 1)[0] for x in df_y.columns}
        if not is_cols_equal:
            print("x: " + str(df_x.columns))
            print("y: " + str(df_y.columns))
            print(diff(df_x.columns, df_y.columns))
            raise AssertionError("Columns in x do not match those in y")

        # reindex to add contexts for duplicate named columns (i.e. Term Accession Number, Unit, etc.)
        df_x.columns = _add_column_context(df_x.columns)
        df_y.columns = _add_column_context(df_y.columns)

        # every column is looked up by label in both frames below, so the labels must agree exactly
        if sorted(df_x.columns) != sorted(df_y.columns):
            print("x: " + str(df_x.columns))
            print("y: " + str(df_y.columns))
            print(diff(df_x.columns, df_y.columns))
            print(diff(df_y.columns, df_x.columns))
            raise AssertionError("Columns in x do not match those in y")

        # zip() below stops at the shorter column, so a differing row count has to be caught explicitly
        if len(df_x) != len(df_y):
            raise AssertionError(
                "Number of rows in x (" + str(len(df_x)) + ") does not match y (" + str(len(df_y)) + ")"
            )

        for colx in df_x.columns:
            for eachx, eachy in zip(df_x[colx].sort_values(), df_y[colx].sort_values()):
                if eachx != eachy:
                    print(df_x[colx])
                    print(df_y[colx])
                    raise AssertionError("Value: " + str(eachx) + ", does not match: " + str(eachy))
        return True
    except AssertionError as e:
        print(str(e))
        return False

예제 #2

파일 보기

파일: utils.py 프로젝트: saravanan-dayalan/isa-api

def assert_tab_content_equal(fp_x, fp_y):
    """
    Test for equality of tab files, only down to level of content - should
    not be taken as canonical equality, but rather that all the expected
    content is present in both input files, regardless of the order in which
    it appears.

    Investigation files (file name starting with ``i_``) are parsed with
    ``read_investigation_file`` and compared section by section. Any other
    file is read as a tab-separated table: all-empty columns are dropped, the
    ``Unit`` / ``Term Source REF`` / ``Term Accession Number`` columns that
    follow a ``Characteristics[...]``, ``Parameter Value[...]`` or
    ``Factor Value[...]`` column are relabelled with that column's name, and
    every column is then compared as a sorted sequence of values.

    For more precise equality, you will need to apply a configuration
        - use assert_tab_equal_by_config(fp_x, fp_y, config)
    :param fp_x: Open file object of an ISAtab file; its ``name`` selects how
        both files are parsed
    :param fp_y: Open file object of another ISAtab file
    :return: True if the contents match, False otherwise (the differences
        found are printed)
    """
    import re
    from os.path import basename

    import numpy as np

    # Columns whose trailing Unit / Term Source REF / Term Accession Number
    # columns need context
    context_regexes = (
        re.compile(r'Characteristics\[(.*?)\]'),
        re.compile(r'Parameter Value\[(.*?)\]'),
        re.compile(r'Factor Value\[(.*?)\]'),
    )

    def _sort_by_first_column(frame):
        # a frame without columns has nothing to sort on
        if len(frame.columns) == 0:
            return frame
        return frame.sort_values(by=frame.columns[0])

    def _assert_df_equal(x, y, report=True):
        # need to sort values to loosen up how equality is calculated
        # NOTE(review): the sorted frames keep their original index labels
        # and assert_frame_equal also compares the index, so frames whose
        # rows are ordered differently may still be reported as different
        # - confirm
        try:
            assert_frame_equal(_sort_by_first_column(x),
                               _sort_by_first_column(y))
            return True
        except AssertionError as e:
            if report:
                print(e)
            return False

    def _frame_lists_match(frames_x, frames_y):
        # Pair every table of x with a distinct equal table of y. DataFrames
        # cannot be ordered with sorted() (comparing two frames does not
        # yield a single boolean), so the tables are matched instead.
        if len(frames_x) != len(frames_y):
            print('x has ' + str(len(frames_x)) + ' tables where y has ' +
                  str(len(frames_y)))
            return False
        # with exactly one candidate the detailed frame diff is meaningful,
        # so let it be printed
        verbose = len(frames_x) == 1
        unmatched = list(frames_y)
        for frame_x in frames_x:
            for position, frame_y in enumerate(unmatched):
                if _assert_df_equal(frame_x, frame_y, report=verbose):
                    del unmatched[position]
                    break
            else:
                print('No table in y matches:')
                print(frame_x)
                return False
        return True

    def _drop_empty_columns(frame):
        # blank strings count as missing values when deciding whether a
        # column is empty
        frame = frame.replace('', np.nan)
        frame = frame.dropna(axis=1, how='all')
        return frame.replace(np.nan, '')

    def _add_column_context(columns):
        # Give the generically named columns that qualify a Characteristics /
        # Parameter Value / Factor Value column a label that includes the
        # column they belong to.
        original = list(columns)
        relabelled = list(original)
        for i, col in enumerate(original):
            if not any(regex.match(col) for regex in context_regexes):
                continue
            try:
                if 'Unit' in original[i + 1]:
                    relabelled[i + 1] = col + '/Unit'
                    if 'Term Source REF' in original[i + 2]:
                        relabelled[i + 2] = col + '/Unit/Term Source REF'
                    if 'Term Accession Number' in original[i + 3]:
                        relabelled[i + 3] = (
                            col + '/Unit/Term Accession Number')
                elif 'Term Source REF' in original[i + 1]:
                    relabelled[i + 1] = col + '/Term Source REF'
                    if 'Term Accession Number' in original[i + 2]:
                        relabelled[i + 2] = col + '/Term Accession Number'
            except IndexError:
                # the column sits at the end of the table: no further
                # qualifiers to relabel
                pass
        return relabelled

    def diff(a, b):
        b = set(b)
        return [aa for aa in a if aa not in b]

    if basename(fp_x.name).startswith('i_'):
        df_dict_x = read_investigation_file(fp_x)
        df_dict_y = read_investigation_file(fp_y)
        for k, dfx in df_dict_x.items():
            if k not in df_dict_y:
                print('Section ' + str(k) + ' is missing from y')
                return False
            dfy = df_dict_y[k]
            if isinstance(dfx, list):
                if not isinstance(dfy, list):
                    return False
                if not _frame_lists_match(dfx, dfy):
                    return False
            elif isinstance(dfy, list) or not _assert_df_equal(dfx, dfy):
                return False
        return True

    df_x = pd.read_csv(fp_x, sep='\t', encoding='utf-8')
    df_y = pd.read_csv(fp_y, sep='\t', encoding='utf-8')
    try:
        # drop empty columns
        df_x = _drop_empty_columns(df_x)
        df_y = _drop_empty_columns(df_y)

        # pandas appends ".N" to repeated headers, so compare the names
        # without that suffix first
        base_names_x = {x.split('.', 1)[0] for x in df_x.columns}
        base_names_y = {x.split('.', 1)[0] for x in df_y.columns}
        if base_names_x != base_names_y:
            print('x: ' + str(df_x.columns))
            print('y: ' + str(df_y.columns))
            print(diff(df_x.columns, df_y.columns))
            raise AssertionError("Columns in x do not match those in y")

        # reindex to add contexts for duplicate named columns
        # (i.e. Term Accession Number, Unit, etc.)
        df_x.columns = _add_column_context(df_x.columns)
        df_y.columns = _add_column_context(df_y.columns)

        # every column is looked up by label in both frames below, so the
        # labels must agree exactly
        if sorted(df_x.columns) != sorted(df_y.columns):
            print('x: ' + str(df_x.columns))
            print('y: ' + str(df_y.columns))
            print(diff(df_x.columns, df_y.columns))
            print(diff(df_y.columns, df_x.columns))
            raise AssertionError("Columns in x do not match those in y")

        # zip() below stops at the shorter column, so a differing row count
        # has to be caught explicitly
        if len(df_x) != len(df_y):
            raise AssertionError("Number of rows in x (" + str(len(df_x)) +
                                 ") does not match y (" + str(len(df_y)) +
                                 ")")

        for colx in df_x.columns:
            for eachx, eachy in zip(df_x[colx].sort_values(),
                                    df_y[colx].sort_values()):
                if eachx != eachy:
                    print(df_x[colx])
                    print(df_y[colx])
                    raise AssertionError("Value: " + str(eachx) +
                                         ", does not match: " + str(eachy))
        return True
    except AssertionError as e:
        print(str(e))
        return False

예제 #3

파일 보기

def assert_tab_content_equal(fp_x, fp_y):
    """
    Test for equality of tab files, only down to level of content -
    should not be taken as canonical equality, but rather that all the
    expected content is present in both input files, regardless of the order
    in which it appears.

    Investigation files (file name starting with ``i_``) are parsed with
    ``read_investigation_file`` and compared section by section. Any other
    file is read as a tab-separated table: all-empty columns are dropped, the
    ``Unit`` / ``Term Source REF`` / ``Term Accession Number`` columns that
    follow a column matched by ``_RX_CHARACTERISTICS``, ``_RX_PARAM_VALUE``
    or ``_RX_FACTOR_VALUE`` are relabelled with that column's name, and every
    column is then compared as a sorted sequence of values.

    For more precise equality, you will need to apply a configuration
        - use assert_tab_equal_by_config(fp_x, fp_y, config)
    :param fp_x: Open file object of an ISAtab file; its ``name`` selects how
        both files are parsed
    :param fp_y: Open file object of another ISAtab file
    :return: True if the contents match, False otherwise (the differences
        found are logged)
    """
    import numpy as np

    def _sort_by_first_column(frame):
        # a frame without columns has nothing to sort on
        if len(frame.columns) == 0:
            return frame
        return frame.sort_values(by=frame.columns[0])

    def _assert_df_equal(x, y, report=True):
        # need to sort values to loosen up how equality is calculated
        # NOTE(review): the sorted frames keep their original index labels
        # and assert_frame_equal also compares the index, so frames whose
        # rows are ordered differently may still be reported as different
        # - confirm
        try:
            assert_frame_equal(
                _sort_by_first_column(x), _sort_by_first_column(y))
            return True
        except AssertionError as e:
            if not report:
                return False
            lbl = datetime.datetime.now()
            try:
                # debugging aid only: the dump directory is hard-coded and
                # may not exist, which must not abort the comparison
                x.to_csv(
                    '~/Downloads/test-isa-for-release/expected-{}.csv'.format(
                        lbl))
                y.to_csv(
                    '~/Downloads/test-isa-for-release/actual-{}.csv'.format(
                        lbl))
            except OSError as dump_error:
                log.error('Could not write the debug CSV dumps: {}'.format(
                    dump_error))
            log.error('Error thrown comparing two dataframes: {}'.format(e))
            log.error('x columns are: {}'.format(x.columns))
            log.error('y columns are: {}'.format(y.columns))
            log.error('x data are: {}'.format(x.to_numpy()))
            log.error('y data are: {}'.format(y.to_numpy()))
            return False

    def _frame_lists_match(frames_x, frames_y):
        # Pair every table of x with a distinct equal table of y. DataFrames
        # cannot be ordered with sorted() (comparing two frames does not
        # yield a single boolean), so the tables are matched instead.
        if len(frames_x) != len(frames_y):
            log.error('x has {} tables where y has {}'.format(
                len(frames_x), len(frames_y)))
            return False
        # with exactly one candidate the detailed frame diff is meaningful,
        # so let it be reported
        verbose = len(frames_x) == 1
        unmatched = list(frames_y)
        for frame_x in frames_x:
            for position, frame_y in enumerate(unmatched):
                if _assert_df_equal(frame_x, frame_y, report=verbose):
                    del unmatched[position]
                    break
            else:
                log.error('No table in y matches: {}'.format(frame_x))
                return False
        return True

    def _drop_empty_columns(frame):
        # blank strings count as missing values when deciding whether a
        # column is empty
        frame = frame.replace('', np.nan)
        frame = frame.dropna(axis=1, how='all')
        return frame.replace(np.nan, '')

    def _add_column_context(columns):
        # Give the generically named columns that qualify a characteristics /
        # parameter value / factor value column a label that includes the
        # column they belong to.
        original = list(columns)
        relabelled = list(original)
        for i, col in enumerate(original):
            if not any(RX.match(col) for RX in (
                    _RX_CHARACTERISTICS, _RX_PARAM_VALUE,
                    _RX_FACTOR_VALUE)):
                continue
            try:
                if 'Unit' in original[i + 1]:
                    relabelled[i + 1] = col + '/Unit'
                    if 'Term Source REF' in original[i + 2]:
                        relabelled[i + 2] = col + '/Unit/Term Source REF'
                    if 'Term Accession Number' in original[i + 3]:
                        relabelled[i + 3] = col + \
                            '/Unit/Term Accession Number'
                elif 'Term Source REF' in original[i + 1]:
                    relabelled[i + 1] = col + '/Term Source REF'
                    if 'Term Accession Number' in original[i + 2]:
                        relabelled[i + 2] = col + '/Term Accession Number'
            except IndexError:
                # the column sits at the end of the table: no further
                # qualifiers to relabel
                pass
        return relabelled

    def diff(a, b):
        b = set(b)
        return [aa for aa in a if aa not in b]

    if basename(fp_x.name).startswith('i_'):
        df_dict_x = read_investigation_file(fp_x)
        df_dict_y = read_investigation_file(fp_y)
        for k, dfx in df_dict_x.items():
            if k not in df_dict_y:
                log.error('Section {} is missing from y'.format(k))
                return False
            dfy = df_dict_y[k]
            if isinstance(dfx, list):
                if not isinstance(dfy, list):
                    return False
                if not _frame_lists_match(dfx, dfy):
                    return False
            elif isinstance(dfy, list) or not _assert_df_equal(dfx, dfy):
                return False
        return True

    df_x = pd.read_csv(fp_x, sep='\t', encoding='utf-8')
    df_y = pd.read_csv(fp_y, sep='\t', encoding='utf-8')
    try:
        # drop empty columns
        df_x = _drop_empty_columns(df_x)
        df_y = _drop_empty_columns(df_y)

        # pandas appends ".N" to repeated headers, so compare the names
        # without that suffix first
        base_names_x = {x.split('.', 1)[0] for x in df_x.columns}
        base_names_y = {x.split('.', 1)[0] for x in df_y.columns}
        if base_names_x != base_names_y:
            log.debug('x: ' + str(df_x.columns))
            log.debug('y: ' + str(df_y.columns))
            log.debug(diff(df_x.columns, df_y.columns))
            raise AssertionError('Columns in x do not match those in y')

        # reindex to add contexts for duplicate named columns
        # (i.e. Term Accession Number, Unit, etc.)
        df_x.columns = _add_column_context(df_x.columns)
        df_y.columns = _add_column_context(df_y.columns)

        # every column is looked up by label in both frames below, so the
        # labels must agree exactly
        if sorted(df_x.columns) != sorted(df_y.columns):
            log.debug('x: ' + str(df_x.columns))
            log.debug('y: ' + str(df_y.columns))
            log.debug(diff(df_x.columns, df_y.columns))
            log.debug(diff(df_y.columns, df_x.columns))
            raise AssertionError('Columns in x do not match those in y')

        # zip() below stops at the shorter column, so a differing row count
        # has to be caught explicitly
        if len(df_x) != len(df_y):
            raise AssertionError('Number of rows in x (' + str(len(df_x)) +
                                 ') does not match y (' + str(len(df_y)) +
                                 ')')

        for colx in df_x.columns:
            for eachx, eachy in zip(df_x[colx].sort_values(),
                                    df_y[colx].sort_values()):
                if eachx != eachy:
                    log.debug(df_x[colx])
                    log.debug(df_y[colx])
                    raise AssertionError('Value: ' + str(eachx) +
                                         ', does not match: ' + str(eachy))
        return True
    except AssertionError as e:
        log.error(str(e))
        return False