Example #1
def quantipy_from_decipher(decipher_meta, decipher_data, text_key='main'): 
    """ Converts the given Decipher data (which must have been exported
    in tab-delimited format) to Quantipy-ready meta and data.
    
    Parameters
    ----------
    decipher_meta : str or dict
        Either the path to the Decipher meta document saved as JSON or
        said document read into memory

    decipher_data : str or pandas.DataFrame
        Either the path to the Decipher data saved as tab-delimited text
        or said file read into memory

    Returns
    -------
    meta : dict
        The Quantipy meta document

    data : pandas.DataFrame
        The converted data
    """

    # If they're not already in memory, read in the Decipher meta and
    # data files (otherwise use the objects that were passed in)
    if isinstance(decipher_meta, str):
        dmeta = load_json(decipher_meta)
    else:
        dmeta = decipher_meta
    if isinstance(decipher_data, str):
        # pd.DataFrame.from_csv() has been removed from pandas;
        # read_csv() with index_col=0 is the equivalent
        data = pd.read_csv(decipher_data, sep='\t', index_col=0)
        data[data.index.name] = data.index
    else:
        data = decipher_data

    meta = start_meta(text_key=text_key)

    quotas = {
        'vqtable': {}, 
        'voqtable': {}
    }

    types_map = {
        'text': 'string',
        'number': 'int',
        'float': 'float',
        'single': 'single',
        'multiple': 'delimited set'
    }

    # Collect the compound questions (those spanning multiple variables)
    compound_questions = [
        question 
        for question in dmeta['questions'] 
        if len(question['variables']) > 1]
    
    # Get basic variables
    for var in dmeta['variables']:
        
        # Collect quota variables; these will be dealt with later.
        # A flag is used because 'continue' inside the inner loop
        # would not skip the rest of the outer iteration.
        is_quota = False
        for qtable in ['vqtable', 'voqtable']:
            if qtable in var['vgroup']:
                quotas[qtable].setdefault(var['vgroup'], []).append(var)
                is_quota = True
                break
        if is_quota:
            continue
        
        # Start the column meta for the current variable
        var_name = var['label']
        column = meta['columns'][var_name] = {
            'type': types_map[var['type']],
            'text': {text_key: var['title']}
        }
        
        # Add meta-mapped path for current column to the 'data file' set
        # object so that the original order of the variables is known
        set_item = 'columns@%s' % (var_name)
        if set_item not in meta['sets']['data file']['items']:
            meta['sets']['data file']['items'].append(set_item)
        
        if var['type']=='single':
            # Get the response values
            column['values'] = get_decipher_values(var['values'], text_key)

    # Manage compound variables (delimited sets, arrays, mixed-type 
    # sets)
    for question in compound_questions:

        if question['type']=='multiple':

            # Construct delimited set
            meta, data, vgroups, vgroup_variables = make_delimited_set(
                meta, data, question
            )
            
            # If there's only 1 vgroup then this is a basic multiple-
            # choice question and doesn't require construction as an
            # array or set
            if len(vgroups)==1:
                continue

        else:
            # vgroups indicate how many groups of discrete variables sit
            # in the question
            
            # Find the number of variable groups in the set
            vgroups = get_vgroups(question['variables'])        
            
            # For each variable group, get its members
            vgroup_variables = get_vgroup_variables(
                vgroups, question['variables']
            )
        
        # vgroup_types is used to keep track of the types used in the
        # variable group. This will help us identify mixed-type
        # question groups which are not arrays.            
        vgroup_types = get_vgroup_types(vgroups, question['variables'])
        unique_vgroup_types = set(vgroup_types.values())
        
        # Note if the vgroups use more than one variable type
        mixed_types = len(unique_vgroup_types) > 1
        
        if mixed_types:
            # A set should be created to bind mixed-type variables 
            # together

            vgroup = vgroups[0]
            
            # Create the set
            mask = meta['sets'][vgroup] = {
                'item type': 'mixed',
                'text': {text_key: question['qtitle']},
                'items': [
                    'columns@%s' % (var['label'])
                    for var in question['variables']
                ]
            }        

        if 'multiple' in vgroup_types.values():
            # This is a multiple grid
            # vgroups and vgroup_variables need to be
            # edited to make them usable in the next step
            # This is related to the structure of multiple
            # response variables in Decipher
            multiple_vgroups = [
                vgroup
                for vgroup in vgroups
                if vgroup_types[vgroup] == 'multiple'
            ]
            vgroup_variables = [copy.copy(vgroups)]
            # Note: [cr] matches 'c' or 'r'; the original [c|r] also
            # matched a literal '|'
            new_vgroup_match = re.match(r'(^.+)(?=[cr][0-9]+)', vgroups[0])
            if new_vgroup_match is None:
                continue
            else:
                vgroups = [new_vgroup_match.group(0)]
                vgroup_types[vgroups[0]] = 'multiple'
        
        # Extract only the vgroups that contain multiple variables
        # so that an array mask can be created for each of them
        array_vgroups = [
            (vgroup, vars)
            for vgroup, vars in zip(vgroups, vgroup_variables)
            if len(vars) > 1
        ]
        
        # If there are any array-like groups of variables inside the
        # question, add an array mask for each of them
        for vgroup, vars in array_vgroups:
        
            if vgroup in meta['masks']:
                # This was a multiple-choice grid and has
                # already been converted
                continue
        
            # It's possible the vgroup is in the 'data file' set
            # and needs to be replaced with the name of the group's
            # component vars. This happens with compound questions
            # that are arrays with added open-ends variables
            mapped_vgroup = 'columns@%s' % (vgroup)
            df_items = meta['sets']['data file']['items']
            if mapped_vgroup in df_items:
                mapped_vars = [('columns@%s' % v['label']) for v in vars]
                idx = meta['sets']['data file']['items'].index(mapped_vgroup)
                df_items = df_items[:idx] + mapped_vars + df_items[idx+1:]
                meta['sets']['data file']['items'] = df_items
                    
            # Create the array mask
            mask = meta['masks'][vgroup] = {
                'type': 'array',
                'item type': types_map[vgroup_types[vgroup]],
                'text': {text_key: (
                    '{} - {}'.format(
                        vars[0]['rowTitle'], 
                        question['qtitle']
                    )
                    if vgroup_types[vgroup] in ['number', 'float', 'text']
                    else question['qtitle']
                )},
                'items': [{
                    'source': 'columns@{}'.format(var['label']),
                    'text': {text_key: var['rowTitle']}}
                    for var in vars
                ]}
    
            if vgroup_types[vgroup] in ['single', 'multiple']:
                # Create lib values entry
                values_mapper = 'lib@values@%s' % (vgroup)
                meta['masks'][vgroup]['values'] = values_mapper
                if vgroup_types[vgroup] == 'single':
                    values = get_decipher_values(question['values'], text_key)
                elif vgroup_types[vgroup] == 'multiple':
                    values = copy.deepcopy(meta['columns'][vars[0]]['values'])
                meta['lib']['values'][vgroup] = values
                
                # Use meta-mapped values reference for single or 
                # multiple array variables
                for item in mask['items']:
                    col = item['source'].split('@')[-1]
                    if col in meta['columns']:
                        if 'values' in meta['columns'][col]:
                            meta['columns'][col]['values'] = values_mapper
    
    # Construct quota columns (meta+data)
    meta, data = manage_decipher_quota_variables(meta, data, quotas)

    # Confirm that all meta columns exist in the data
    for col in list(meta['columns'].keys()):
        if col not in data.columns:
            print(
                "Unpaired data warning: {} found in meta['columns']"
                " but not in data.columns. Removing it.".format(col))
            del meta['columns'][col]
            set_item = 'columns@{}'.format(col)
            if set_item in meta['sets']['data file']['items']:
                meta['sets']['data file']['items'].remove(set_item)

    # Confirm that all data columns exist in the meta
    for col in data.columns:
        if col not in meta['columns']:
            print(
                "Unpaired meta warning: {} found in data.columns"
                " but not in meta['columns']. Removing it.".format(col))
            data.drop(col, axis=1, inplace=True)

    return meta, data
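
A hypothetical call of the converter above; the file names are placeholders, and quantipy_from_decipher (with its helpers load_json and start_meta) is assumed to be importable from the surrounding module:

# Hypothetical usage: 'survey_meta.json' and 'survey_data.txt' are
# placeholder paths for the Decipher JSON meta and tab-delimited data.
meta, data = quantipy_from_decipher('survey_meta.json', 'survey_data.txt')
print(list(meta['columns'].keys()))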
Example #2
def extract_sav_meta(sav_file,
                     name="",
                     data=None,
                     ioLocale='en_US.UTF-8',
                     ioUtf8=True,
                     dichot=None,
                     dates_as_strings=False,
                     text_key="main"):

    """ see parse_sav_file doc """
    if dichot is None: dichot = {'yes': 1, 'no': 0}
    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                            ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
        #  'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
        #  'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
        #  'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(text_key=text_key)
    meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
    meta['info']['from_source'] = {'pandas_reader': 'sav'}
    meta['sets']['data file']['items'] = [
        'columns@{}'.format(varName) for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of the attributes in metadata are located here:
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}
        meta['columns'][column]['name'] = column
        meta['columns'][column]['parent'] = {}
        if column in metadata.valueLabels:
            # valueLabels implies type = 'single' (possibly a 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                values = {
                    'text': {
                        text_key: unicode(text)
                    },
                    'value': int(value)
                }
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                f = metadata.formats[column]
                if 'DATETIME' in f:
                    if dates_as_strings:
                        # DATETIME fields from SPSS are currently
                        # being read in as strings because there's an
                        # as-yet undetermined discrepancy between the
                        # input and output dates if datetime64 is used
                        meta['columns'][column]['type'] = 'string'
                    else:
                        meta['columns'][column]['type'] = 'date'
                        data[column] = pd.to_datetime(data[column])
                elif f.startswith('A'):
                    meta['columns'][column]['type'] = 'string'
                elif '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    # print "VAR '{}' NOT IN value_labels".format(column)
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first "not nan" value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND Int because savReaderWriter loads them both as float64
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "float"
                            # All non-missing values are whole numbers,
                            # so the column is typed int (the cast can
                            # fail when NaNs are present)
                            if (data[column].dropna() % 1 == 0).all():
                                try:
                                    data[column] = data[column].astype('int')
                                except (ValueError, TypeError):
                                    pass
                                meta['columns'][column]['type'] = "int"

                        elif isinstance(value, (unicode, str)):
                            # Strings
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "string"

        if column in metadata.varTypes:
            pass

        if column in metadata.varSets:
            pass

        if column in metadata.varAttributes:
            pass

        if column in metadata.varRoles:
            pass

        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {
                text_key: metadata.varLabels[column]
            }

    for mrset in metadata.multRespDefs:
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            # Multiple category sets are not converted here
            pass
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
            varNames = metadata.multRespDefs[mrset]['varNames']
            # Find the index where the delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            # Generate the delimited set from the dichotomous set
            dls = condense_dichotomous_set(data[varNames],
                                           values_from_labels=False,
                                           **dichot)
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {text_key: metadata.multRespDefs[mrset]['label']},
                'parent': {},
                'values': [{
                    'text': {text_key: metadata.varLabels[varName]},
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(df_items.index('columns@{}'.format(varNames[0])),
                            'columns@{}'.format(mrset))

            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

    return meta, data
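
The 'D' branch above leans on condense_dichotomous_set() to fold the dichotomous columns into one delimited-set column. A minimal stand-in sketch of that transformation (the real helper also honours labels, the dichot mapping and missing data):

import pandas as pd

# Illustrative stand-in only, not the real helper: join the 1-based
# positions of the columns flagged 1 into a delimited-set string.
def toy_condense(df):
    return df.apply(
        lambda row: ';'.join(
            str(i) for i, flag in enumerate(row, start=1) if flag == 1
        ) + ';',
        axis=1)

dichot = pd.DataFrame({'q1_1': [1, 0], 'q1_2': [1, 1]})
print(toy_condense(dichot).tolist())  # ['1;2;', '2;']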
Example #3
def quantipy_from_decipher(decipher_meta, decipher_data, text_key='main'): 
    """ Converts the given Decipher data (which must have been exported
    in tab-delimited format) to Quantipy-ready meta and data.
    
    Parameters
    ----------
    decipher_meta : str or dict
        Either the path to the Decipher meta document saved as JSON or
        said document read into memory

    decipher_data : str or pandas.DataFrame
        Either the path to the Decipher data saved as tab-delimited text
        or said file read into memory

    Returns
    -------
    meta : dict
        The Quantipy meta document

    data : pandas.DataFrame
        The converted data
    """

    # If they're not already in memory, read in the Decipher meta and
    # data files (otherwise use the objects that were passed in)
    if isinstance(decipher_meta, (str, unicode)):
        dmeta = load_json(decipher_meta)
    else:
        dmeta = decipher_meta
    if isinstance(decipher_data, (str, unicode)):
        data = pd.read_csv(decipher_data, sep='\t', index_col=0)
    else:
        data = decipher_data

    meta = start_meta(text_key=text_key)

    quotas = {
        'vqtable': {}, 
        'voqtable': {}
    }

    types_map = {
        'text': 'string',
        'number': 'int',
        'float': 'float',
        'single': 'single',
        'multiple': 'delimited set'
    }

    # Get basic variables
    for var in dmeta['variables']:
        
        # Collect quota variables; these will be dealt with later.
        # A flag is used because 'continue' inside the inner loop
        # would not skip the rest of the outer iteration.
        is_quota = False
        for qtable in ['vqtable', 'voqtable']:
            if qtable in var['vgroup']:
                quotas[qtable].setdefault(var['vgroup'], []).append(var)
                is_quota = True
                break
        if is_quota:
            continue
        
        # Add meta-mapped path for current column to the 'data file' set
        # object so that the original order of the variables is known
        set_item = 'columns@%s' % (var['vgroup'])    
        if set_item not in meta['sets']['data file']['items']:
            meta['sets']['data file']['items'].append(set_item)
        
        # Start the column meta for the current variable
        var_name = var['label']
        column = meta['columns'][var_name] = {
            'type': types_map[var['type']],
            'text': {text_key: var['title']}
        }
        
        if var['type']=='single':
            # Get the response values
            column['values'] = get_decipher_values(var['values'], text_key)

    # Create generator for compound questions
    compound_questions = (
        question 
        for question in dmeta['questions'] 
        if len(question['variables']) > 1
    )

    # Manage compound variables (delimited sets, arrays, mixed-type 
    # sets)
    for question in compound_questions:
        
        if question['type']=='multiple':

            # Construct delimited set
            meta, data, vgroups, vgroup_variables = make_delimited_set(
                meta, data, question
            )
            
            # If there's only 1 vgroup then this is a basic multiple-
            # choice question and doesn't require construction as an
            # array or set
            if len(vgroups)==1:
                continue

        else:
            # vgroups indicate how many groups of discrete variables sit
            # in the question
            
            # Find the number of variable groups in the set
            vgroups = get_vgroups(question['variables'])        
            
            # For each variable group, get its members
            vgroup_variables = get_vgroup_variables(
                vgroups, question['variables']
            )
        
        # vgroup_types is used to keep track of the types used in the
        # variable group. This will help us identify mixed-type
        # question groups which are not arrays.            
        vgroup_types = get_vgroup_types(vgroups, question['variables'])
        unique_vgroup_types = set(vgroup_types.values())
        
        # Note if the vgroups use more than one variable type
        mixed_types = len(unique_vgroup_types) > 1
        
        if mixed_types:
            # A set should be created to bind mixed-type variables 
            # together

            vgroup = vgroups[0]
            
            # Create the set
            mask = meta['sets'][vgroup] = {
                'item type': 'mixed',
                'text': {text_key: question['qtitle']},
                'items': [
                    'columns@%s' % (var['label'])
                    for var in question['variables']
                ]
            }        

        if 'multiple' in vgroup_types.values():
            # This is a multiple grid
            # vgroups and vgroup_variables need to be
            # edited to make them usable in the next step
            # This is related to the structure of multiple
            # response variables in Decipher
            multiple_vgroups = [
                vgroup
                for vgroup in vgroups
                if vgroup_types[vgroup] == 'multiple'
            ]
            vgroup_variables = [copy.copy(vgroups)]
            # Note: [cr] matches 'c' or 'r'; the original [c|r] also
            # matched a literal '|'
            new_vgroup_match = re.match(r'(^.+)(?=[cr][0-9]+)', vgroups[0])
            if new_vgroup_match is None:
                continue
            else:
                vgroups = [new_vgroup_match.group(0)]
                vgroup_types[vgroups[0]] = 'multiple'
        
        # Extract only the vgroups that contain multiple variables
        # so that an array mask can be created for each of them
        array_vgroups = [
            (vgroup, vars)
            for vgroup, vars in zip(vgroups, vgroup_variables)
            if len(vars) > 1
        ]
        
        # If there are any array-like groups of variables inside the
        # question, add an array mask for each of them
        for vgroup, vars in array_vgroups:
        
            # It's possible the vgroup is in the 'data file' set
            # and needs to be replaced with the name of the group's
            # component vars. This happens with compound questions
            # that are arrays with added open-ends variables
            mapped_vgroup = 'columns@%s' % (vgroup)
            df_items = meta['sets']['data file']['items']
            if mapped_vgroup in df_items:
                mapped_vars = [('columns@%s' % v['label']) for v in vars]
                idx = df_items.index(mapped_vgroup)
                df_items = df_items[:idx] + mapped_vars + df_items[idx+1:]
                meta['sets']['data file']['items'] = df_items
                    
            # Create the array mask
            mask = meta['masks'][vgroup] = {
                'type': 'array',
                'item type': types_map[vgroup_types[vgroup]],
                'text': {text_key: (
                    '%s - %s' % (
                        vars[0]['rowTitle'], 
                        question['qtitle']
                    )
                    if vgroup_types[vgroup] in ['number', 'float', 'text']
                    else question['qtitle']
                )},
                'items': [
                    'columns@%s' % (
                        var
                        if vgroup_types[vgroup]=='multiple' 
                        else var['label'] 
                    )
                    for var in vars
                ]
            }
    
            if vgroup_types[vgroup] in ['single', 'multiple']:
                # Create lib values entry
                values_mapping = 'lib@values@%s' % (vgroup)
                if vgroup_types[vgroup] == 'single':
                    values = get_decipher_values(question['values'], text_key)
                elif vgroup_types[vgroup] == 'multiple':
                    values = copy.deepcopy(meta['columns'][vars[0]]['values'])
                meta['lib']['values'][vgroup] = values
                
                # Use meta-mapped values reference for single or 
                # multiple array variables
                for item in mask['items']:
                    col = item.split('@')[-1]
                    meta['columns'][col]['values'] = values_mapping
    
    # Construct quota columns (meta+data)
    meta, data = manage_decipher_quota_variables(meta, data, quotas)

    return meta, data
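
The trickiest step above is the lookahead that recovers a parent vgroup name from a multiple-response variable name. Assuming, for illustration, that Decipher names such columns with the vgroup followed by an r/c row-or-column suffix, the match behaves like this:

import re

# 'q5r2' is a made-up Decipher-style column name; the lookahead keeps
# everything before the 'r2' suffix.
match = re.match(r'(^.+)(?=[cr][0-9]+)', 'q5r2')
print(match.group(0))  # 'q5'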
Example #4
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8', ioUtf8=True):
    """ see parse_sav_file doc """
    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale, ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
        #  'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
        #  'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
        #  'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(name=name)
    meta['info']['text'] = 'Converted from SAV file %s.' % (name)
    meta['info']['from_source'] = {'pandas_reader':'sav'}
    meta['sets']['data file']['items'] = [
        'columns@%s' % (varName)
        for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of the attributes in metadata are located here:
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}

        if column in metadata.valueLabels:
            # valueLabels implies type = 'single' (possibly a 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                values = {'text': {'main': unicode(text)},
                          'value': unicode(int(value))}
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                f = metadata.formats[column]
                if '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    # print "VAR '{}' NOT IN value_labels".format(column)
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first "not nan" value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND Int because savReaderWriter loads them both as float64
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "float"
                            # All non-missing values are whole numbers,
                            # so the column is typed int (the cast can
                            # fail when NaNs are present)
                            if (data[column].dropna() % 1 == 0).all():
                                try:
                                    data[column] = data[column].astype('int')
                                except (ValueError, TypeError):
                                    pass
                                meta['columns'][column]['type'] = "int"

                        elif isinstance(value, (unicode, str)):
                            # Strings
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "string"

        if column in metadata.varTypes:
            pass

        if column in metadata.varSets:
            pass

        if column in metadata.varAttributes:
            pass

        if column in metadata.varRoles:
            pass

        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {'main': metadata.varLabels[column]}

    for mrset in metadata.multRespDefs:
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            # Multiple category sets are not converted here
            pass
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
            varNames = metadata.multRespDefs[mrset]['varNames']
            data[mrset] = condense_dichotomous_set(data[varNames], values_from_labels=False)
            meta['columns'][mrset] = {
                'type': 'delimited set',
                'text': {'main': metadata.multRespDefs[mrset]['label']},
                'values': [
                    {
                        'text': {'main': metadata.varLabels[varName]},
                        'value': v
                    }
                    for v, varName in enumerate(varNames, start=1)
                ]
            }
            idx = meta['sets']['data file']['items'].index('columns@%s' % (varNames[0]))
            items = meta['sets']['data file']['items']
            meta['sets']['data file']['items'] = items[:idx] + ['columns@%s' % (mrset)] + items[idx+len(varNames):]
            

    return meta, data
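
The 'data file' bookkeeping above is a plain list splice: the dichotomous columns are replaced in place by the new set column. A toy illustration with made-up names:

# Mirrors the splice performed on meta['sets']['data file']['items'];
# all names here are made up.
items = ['columns@age', 'columns@q1_1', 'columns@q1_2', 'columns@q1_3']
idx = items.index('columns@q1_1')
items = items[:idx] + ['columns@q1'] + items[idx + 3:]
print(items)  # ['columns@age', 'columns@q1']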
Example #5
def quantipy_from_ascribe(path_xml, path_txt, text_key='main'):
 
    # Read the AScribe meta (XML) and data (tab-delimited); the file
    # handle is closed explicitly and read_csv replaces the removed
    # pd.DataFrame.from_csv
    with open(path_xml) as f:
        meta_ascribe = xmltodict.parse(f.read())
    data_ascribe = pd.read_csv(
        path_txt,
        sep='\t',
        header=0,
        index_col=0,
        encoding='utf-16'
    )
     
    # Start a Quantipy meta document
    meta = start_meta(text_key=text_key)
    meta['columns']['responseid'] = {
        'type': 'int',
        'text': {text_key: 'responseid'}
    }
    
    # Container to record the names, in order, of the resulting
    # coded columns
    coded_names = []
     
    for var in meta_ascribe['CodedQuestions']['MultiForm']:
        name = var['Name']
        coded_names.append(name)
        coded_from = var['FormTexts']['FormText']['Title']
        var_text = var['FormTexts']['FormText']['Text']
        if var_text is None: var_text = 'Label not provided'
        var_text = {text_key: var_text}
        columns = []
        values = []
        for val in var['Answers']['Answer']:
            value = int(val['@Precode'])
            if value==0:
                msg = (
                    "The value 0 has been assigned to a code for the "
                    "variable '%s'."
                ) % (name)
                warnings.warn(msg)
            val_text = val['Texts']['Text']['#text']
            if val_text is None: val_text = 'Label not provided'
            val_text = {text_key: val_text}
            values.append({'value': value, 'text': val_text})
            columns.append('%s_%s' % (name, value))
             
        # Create a single series from the dichotomous set
        data_ascribe[name] = condense_dichotomous_set(
            data_ascribe[columns], 
            sniff_single=True
        )
         
        # Determine the Quantipy type of the returned
        # series from its dtype (see 'sniff_single' in
        # condense_dichotomous_set())
        if data_ascribe[columns].sum(axis=1).max()==1:
            col_type = 'single'    
        else:
            col_type = 'delimited set'
             
        # Create the new Quantipy column meta 
        column = {
            'type': col_type,
            'text': var_text,
            'values': values
        }
         
        # Add the newly defined column to the Quantipy meta
        meta['columns'][name] = column
        meta['sets']['data file']['items'] = [
            'columns@%s' % (col_name)
            for col_name in coded_names
        ]
     
    # Keep only the slice that has been converted.
    data = data_ascribe[coded_names]

    return meta, data
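
A hypothetical call of the Ascribe converter above (both paths are placeholders; Ascribe exports the meta as XML and the data as UTF-16 tab-delimited text):

# Hypothetical usage with placeholder file names.
meta, data = quantipy_from_ascribe('verbatims.xml', 'verbatims.txt')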
Example #6
def quantipy_from_ascribe(path_xml, path_txt, text_key='main'):

    # Read the AScribe meta (XML) and data (tab-delimited); the file
    # handle is closed explicitly and read_csv replaces the removed
    # pd.DataFrame.from_csv
    with open(path_xml) as f:
        meta_ascribe = xmltodict.parse(f.read())
    data_ascribe = pd.read_csv(path_txt,
                               sep='\t',
                               header=0,
                               index_col=0,
                               encoding='utf-16')

    # Start a Quantipy meta document
    meta = start_meta(text_key=text_key)
    meta['columns']['responseid'] = {
        'type': 'int',
        'text': {
            text_key: 'responseid'
        }
    }

    # Container to record the names, in order, of the resulting
    # coded columns
    coded_names = []

    for var in meta_ascribe['CodedQuestions']['MultiForm']:
        name = var['Name']
        coded_names.append(name)
        coded_from = var['FormTexts']['FormText']['Title']
        var_text = var['FormTexts']['FormText']['Text']
        if var_text is None: var_text = 'Label not provided'
        var_text = {text_key: var_text}
        columns = []
        values = []
        for val in var['Answers']['Answer']:
            value = int(val['@Precode'])
            if value == 0:
                msg = ("The value 0 has been assigned to a code for the "
                       "variable '%s'.") % (name)
                warnings.warn(msg)
            val_text = val['Texts']['Text']['#text']
            if val_text is None: val_text = 'Label not provided'
            val_text = {text_key: val_text}
            values.append({'value': value, 'text': val_text})
            columns.append('%s_%s' % (name, value))

        # Create a single series from the dichotomous set
        data_ascribe[name] = condense_dichotomous_set(data_ascribe[columns],
                                                      sniff_single=True)

        # Determine the Quantipy type of the returned
        # series from its dtype (see 'sniff_single' in
        # condense_dichotomous_set())
        if data_ascribe[columns].sum(axis=1).max() == 1:
            col_type = 'single'
        else:
            col_type = 'delimited set'

        # Create the new Quantipy column meta
        column = {'type': col_type, 'text': var_text, 'values': values}

        # Add the newly defined column to the Quantipy meta
        meta['columns'][name] = column
        meta['sets']['data file']['items'] = [
            'columns@%s' % (col_name) for col_name in coded_names
        ]

    # Keep only the slice that has been converted.
    data = data_ascribe[coded_names]

    return meta, data
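
The single-versus-delimited-set decision above comes down to one check on the dichotomous block: if no respondent ticked more than one column, the condensed series can be typed 'single'. A toy version of that check:

import pandas as pd

# Made-up dichotomous block: each respondent ticked exactly one column.
dichot = pd.DataFrame({'q2_1': [1, 0], 'q2_2': [0, 1]})
col_type = 'single' if dichot.sum(axis=1).max() == 1 else 'delimited set'
print(col_type)  # 'single'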
Example #7
def extract_sav_meta(sav_file,
                     name="",
                     data=None,
                     ioLocale='en_US.UTF-8',
                     ioUtf8=True,
                     dichot=None,
                     dates_as_strings=False,
                     text_key="en-GB",
                     engine='savReaderWriter'):

    if engine == 'readstat':
        df, metadata = pyreadstat.read_sav(sav_file,
                                           encoding=ioLocale.split(".")[-1],
                                           metadataonly=True)
        meta = start_meta(text_key=text_key)

        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName) for varName in metadata.column_names
        ]

        for index, column in enumerate(metadata.column_names):
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.variable_value_labels:
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.variable_value_labels[
                        column].items():
                    values = {
                        'text': {text_key: str(text)},
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
                # If the user stored single-answer data as strings rather
                # than numbers, convert to floats, coercing anything
                # unconvertible to NaN (done once per column rather than
                # once per value label)
                if data is not None and column in data.columns \
                        and data[column].dtype == 'O':
                    data[column] = pd.to_numeric(data[column],
                                                 errors='coerce',
                                                 downcast='float')
            else:
                if column in metadata.original_variable_types:
                    f = metadata.original_variable_types[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's an
                            # as-yet undetermined discrepancy between the
                            # input and output dates if datetime64 is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"

            # add the variable label to the meta
            meta['columns'][column]['text'] = {
                text_key: metadata.column_labels[index]
            }
        return meta, data

    elif engine == 'savReaderWriter':
        # see parse_sav_file doc
        if dichot is None: dichot = {'yes': 1, 'no': 0}
        with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                                ioUtf8=ioUtf8) as header:
            # Metadata Attributes
            # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
            #  'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
            #  'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
            #  'fileLabel', 'missingValues']
            metadata = header.dataDictionary(True)

        meta = start_meta(text_key=text_key)
        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName) for varName in metadata.varNames
        ]

        # This should probably be somewhere in the metadata
        # weight_variable_name = metadata.caseWeightVar

        # Descriptions of attributes in metadata are are located here :
        # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
        for column in metadata.varNames:
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.valueLabels:
                # ValueLabels is type = 'single' (possibry 1-1 map)
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.valueLabels[column].items():
                    values = {
                        'text': {
                            text_key: str(text)
                        },
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
            else:
                if column in metadata.formats:
                    f = metadata.formats[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's an
                            # as-yet undetermined discrepancy between the
                            # input and output dates if datetime64 is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"
                else:
                    # Infer meta from data
                    if data is not None:
                        # print "VAR '{}' NOT IN value_labels".format(column)
                        column_values = data[column].dropna()
                        if len(column_values) > 0:
                            # Get the first "not nan" value from the column
                            value = column_values.values[0]
                            if isinstance(value, np.float64):
                                # Float AND Int because savReaderWriter loads them both as float64
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "float"
                                # All non-missing values are whole
                                # numbers, so the column is typed int
                                # (the cast can fail when NaNs are
                                # present)
                                if (data[column].dropna() % 1 == 0).all():
                                    try:
                                        data[column] = data[column].astype(
                                            'int')
                                    except (ValueError, TypeError):
                                        pass
                                    meta['columns'][column]['type'] = "int"

                            elif isinstance(value, str):
                                # Strings
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "string"

            if column in metadata.varTypes:
                pass

            if column in metadata.varSets:
                pass

            if column in metadata.varAttributes:
                pass

            if column in metadata.varRoles:
                pass

            if column in metadata.measureLevels:
                pass

            # Some labels are empty strings.
            if column in metadata.varLabels:
                meta['columns'][column]['text'] = {
                    text_key: metadata.varLabels[column]
                }

        for mrset in metadata.multRespDefs:
            # 'D' is "multiple dichotomy sets" in SPSS
            # 'C' is "multiple category sets" in SPSS
            varNames = list(metadata.multRespDefs[mrset]['varNames'])
            # Find the index where the delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            if metadata.multRespDefs[mrset]['setType'] == 'C':
                # Raise if the columns do not share an identical
                # values object
                if not all(meta['columns'][v]['values'] == meta['columns'][
                        varNames[0]]['values'] for v in varNames):
                    msg = 'Columns must have equal values to be combined in a set: {}'
                    raise ValueError(msg.format(varNames))
                # Concatenate the columns into one delimited set
                df_str = data[varNames].astype('str')
                dls = df_str.apply(
                    lambda x: ';'.join([
                        v.replace('.0', '')
                        for v in x.tolist() if v not in ['nan', 'None']
                    ]) + ';',
                    axis=1)
                # Rows with no answers condense to ';' and become NaN
                # (np.nan: the np.NaN alias was removed in NumPy 2.0)
                dls.replace({';': np.nan}, inplace=True)
                # Get the shared values object
                values = meta['columns'][varNames[0]]['values']

            elif metadata.multRespDefs[mrset]['setType'] == 'D':
                # Generate the delimited set from the dichotomous set
                dls = condense_dichotomous_set(data[varNames],
                                               values_from_labels=False,
                                               **dichot)
                # Get value object
                values = [{
                    'text': {
                        text_key: metadata.varLabels[varName]
                    },
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            else:
                continue
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {
                    text_key: metadata.multRespDefs[mrset]['label']
                },
                'parent': {},
                'values': values
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(df_items.index('columns@{}'.format(varNames[0])),
                            'columns@{}'.format(mrset))

            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

        return meta, data
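
A minimal usage sketch for the dual-engine version above; 'survey.sav' is a placeholder path and the pyreadstat call mirrors the one inside the function:

import pyreadstat

# Load the case data first, then extract the Quantipy meta from the
# same (placeholder) file using the readstat engine.
df, _ = pyreadstat.read_sav('survey.sav')
meta, data = extract_sav_meta('survey.sav', name='survey', data=df,
                              engine='readstat')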