def delimited_from_dichotomous(meta, df, name):
    """
    Convert a DataFrame of dichotomous (0/1) response columns into a
    single series.

    Takes df, which should contain one or more columns of dichotomous
    data (as 0s/1s) related to the same set of response options, and
    returns a single series. The returned series will be a delimited set
    if necessary, but if there is only 1 column in df or the responses
    indicated in the data are mutually exclusive then a normal 'single'
    series will be returned instead and the meta type for that column
    will be adjusted to single.

    Parameters
    ----------
    meta : dict
        The meta document paired to the data being converted
    df : pandas.DataFrame
        The column/s in the dichotomous set. This may be a single-column
        DataFrame, in which case a non-delimited set will be returned.
    name : str
        The relevant key name for the resulting column in meta['columns']

    Returns
    -------
    meta : dict
        The meta document paired to the data being converted
    series : pandas.Series
        The converted series
    """
    if df.shape[1] == 1:
        # The set has only 1 possible response: convert to single.
        # Select the lone column so a Series is returned (the previous
        # code returned a one-column DataFrame here, contradicting the
        # documented return type).
        series = df[df.columns[0]].replace(0, np.nan)
        # Update type in meta
        meta['columns'][name]['type'] = 'single'
        return meta, series
    elif all(v <= 1 for v in df.sum(axis=1)):
        # The set values are mutually exclusive: convert to single.
        df = df.copy()
        for v, col in enumerate(df.columns, start=1):
            # Recode each dichotomous column to its 1-based category code
            df[v] = df[col].replace(1, v)
            del df[col]
        # At most one non-zero code per row, so the row sum is the code;
        # 0 (no response) becomes NaN.
        series = df.sum(axis=1).replace(0, np.nan)
        # Update type in meta
        meta['columns'][name]['type'] = 'single'
        return meta, series
    else:
        # Overlapping responses: a true delimited set is required.
        series = condense_dichotomous_set(df, values_from_labels=False)
        return meta, series
def delimited_from_dichotomous(meta, df, name):
    """
    Collapse a set of dichotomous (0/1) response columns into one series.

    When df holds a single column, or when its responses never overlap,
    the result is a plain 'single' series and meta['columns'][name] is
    retyped to 'single'. Otherwise a delimited-set series is produced via
    condense_dichotomous_set.

    Parameters
    ----------
    meta : dict
        The meta document paired to the data being converted.
    df : pandas.DataFrame
        The column/s belonging to the dichotomous set.
    name : str
        Key of the resulting column in meta['columns'].

    Returns
    -------
    meta : dict
        The (possibly retyped) meta document.
    series : pandas.Series or pandas.DataFrame
        The converted data.
    """
    if df.shape[1] == 1:
        # Only one possible response: treat the column as 'single'.
        meta['columns'][name]['type'] = 'single'
        return meta, df.replace(0, np.NaN)

    if all(total <= 1 for total in df.sum(axis=1)):
        # Mutually exclusive responses: recode each column to its
        # 1-based category code, then collapse with a row-wise sum
        # (0, i.e. no response, becomes NaN).
        recoded = df.copy()
        for code, col in enumerate(recoded.columns, start=1):
            recoded[code] = recoded[col].replace(1, code)
            del recoded[col]
        meta['columns'][name]['type'] = 'single'
        return meta, recoded.sum(axis=1).replace(0, np.NaN)

    # Overlapping responses: build a genuine delimited set.
    return meta, condense_dichotomous_set(df, values_from_labels=False)
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8',
                     ioUtf8=True, dichot=None, dates_as_strings=False,
                     text_key="main"):
    """
    Build a Quantipy meta document from an SPSS .sav file's header.

    See parse_sav_file doc. Reads the SAV header with savReaderWriter,
    derives per-column Quantipy types ('single', 'string', 'date',
    'float', 'int') and converts SPSS multiple-dichotomy sets ('D') into
    Quantipy delimited sets, replacing the source columns in ``data``.

    NOTE(review): this variant uses Python-2-only constructs
    (``unicode``, ``dict.iteritems``) and ``pd.np`` further down — it
    will not run on Python 3 / pandas >= 2 as written.

    Parameters
    ----------
    sav_file : str
        Path of the .sav file whose header is read.
    name : str
        Name echoed into the meta's info text.
    data : pandas.DataFrame
        Case data already read from the same file. Mutated in place
        (date parsing, int downcast) and re-bound when dichotomous sets
        are condensed.
    ioLocale : str
        Locale handed through to savReaderWriter.
    ioUtf8 : bool
        Unicode mode handed through to savReaderWriter.
    dichot : dict, default {'yes': 1, 'no': 0}
        Counted/not-counted value mapping for dichotomous sets.
    dates_as_strings : bool
        If True, DATETIME columns are typed 'string' instead of 'date'.
    text_key : str
        Text key under which all labels are stored in the meta.

    Returns
    -------
    meta : dict
    data : pandas.DataFrame
    """
    if dichot is None: dichot = {'yes': 1, 'no': 0}
    """ see parse_sav_file doc """
    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale, ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
        # 'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
        # 'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
        # 'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(text_key=text_key)
    meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
    meta['info']['from_source'] = {'pandas_reader': 'sav'}
    # Register every source variable in the 'data file' set, in order.
    meta['sets']['data file']['items'] = [
        'columns@{}'.format(varName) for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of attributes in metadata are are located here :
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}
        meta['columns'][column]['name'] = column
        meta['columns'][column]['parent'] = {}
        if column in metadata.valueLabels:
            # ValueLabels is type = 'single' (possibry 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                values = {
                    'text': {text_key: unicode(text)},
                    'value': int(value)
                }
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                # Derive the type from the SPSS print format string.
                f = metadata.formats[column]
                if 'DATETIME' in f:
                    if dates_as_strings:
                        # DATETIME fields from SPSS are currently
                        # being read in as strings because there's an
                        # as-yet undetermined discrepancy between the
                        # input and output dates if datetime64 is used
                        meta['columns'][column]['type'] = 'string'
                    else:
                        meta['columns'][column]['type'] = 'date'
                        data[column] = pd.to_datetime(data[column])
                elif f.startswith('A'):
                    # 'A…' formats are SPSS string variables.
                    meta['columns'][column]['type'] = 'string'
                elif '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    # print "VAR '{}' NOT IN value_labels".format(column)
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first "not nan" value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND Int because savReaderWriter loads them both as float64
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "float"
                            # All values whole numbers -> downcast to int.
                            # NOTE(review): ``.unique() == [0]`` compares an
                            # ndarray to a list — confirm intended semantics.
                            if (data[column].dropna() % 1).sum() == 0:
                                if (data[column].dropna() % 1).unique() == [0]:
                                    try:
                                        data[column] = data[column].astype(
                                            'int')
                                    except:
                                        pass
                                    meta['columns'][column]['type'] = "int"
                        elif isinstance(value, unicode) or isinstance(
                                value, str):
                            # Strings
                            meta['columns'][column]['text'] = {
                                text_key: [column]
                            }
                            meta['columns'][column]['type'] = "string"

        # The following header sections are recognised but currently unused.
        if column in metadata.varTypes:
            pass
        if column in metadata.varSets:
            pass
        if column in metadata.varAttributes:
            pass
        if column in metadata.varRoles:
            pass
        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {
                text_key: metadata.varLabels[column]
            }

    for mrset in metadata.multRespDefs:
        # meta['masks'][mrset] = {}
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            # No-op placeholder: 'C' sets are not converted by this variant.
            'C'
            # meta['masks'][mrset]['type'] = "categorical set"
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
            # No-op placeholder string kept from the original structure.
            'D'
            varNames = metadata.multRespDefs[mrset]['varNames']
            # Find the index where there delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            # Generate the delimited set from the dichotomous set
            dls = condense_dichotomous_set(data[varNames],
                                           values_from_labels=False, **dichot)
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {
                    text_key: metadata.multRespDefs[mrset]['label']
                },
                'parent': {},
                'values': [{
                    'text': {
                        text_key: metadata.varLabels[varName]
                    },
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(df_items.index('columns@{}'.format(varNames[0])),
                            'columns@{}'.format(mrset))
            # Drop the source dichotomous columns from data, the set and
            # the column meta — they are fully represented by mrset now.
            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

    return meta, data
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8',
                     ioUtf8=True):
    """
    Legacy variant: build Quantipy meta from an SPSS .sav header.

    See parse_sav_file doc. Differs from the later variant above in that
    labels are keyed under the literal text key 'main', value codes are
    stored as unicode strings (not ints), there is no DATETIME/string
    format handling, and 'D' sets are appended as new columns rather
    than inserted in place.

    NOTE(review): Python-2-only constructs (``unicode``,
    ``dict.iteritems``, ``pd.np``) — will not run on Python 3 as written.

    Parameters
    ----------
    sav_file : str
        Path of the .sav file whose header is read.
    name : str
        Name echoed into the meta's info text.
    data : pandas.DataFrame
        Case data already read from the same file; mutated in place.
    ioLocale : str
        Locale handed through to savReaderWriter.
    ioUtf8 : bool
        Unicode mode handed through to savReaderWriter.

    Returns
    -------
    meta : dict
    data : pandas.DataFrame
    """
    """ see parse_sav_file doc """
    with sr.SavHeaderReader(sav_file, ioLocale=ioLocale, ioUtf8=ioUtf8) as header:
        # Metadata Attributes
        # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
        # 'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
        # 'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
        # 'fileLabel', 'missingValues']
        metadata = header.dataDictionary(True)

    meta = start_meta(name=name)
    meta['info']['text'] = 'Converted from SAV file %s.' % (name)
    meta['info']['from_source'] = {'pandas_reader':'sav'}
    # Register every source variable in the 'data file' set, in order.
    meta['sets']['data file']['items'] = [
        'columns@%s' % (varName) for varName in metadata.varNames
    ]

    # This should probably be somewhere in the metadata
    # weight_variable_name = metadata.caseWeightVar

    # Descriptions of attributes in metadata are are located here :
    # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
    for column in metadata.varNames:
        meta['columns'][column] = {}
        if column in metadata.valueLabels:
            # ValueLabels is type = 'single' (possibry 1-1 map)
            meta['columns'][column]['values'] = []
            meta['columns'][column]['type'] = "single"
            for value, text in metadata.valueLabels[column].iteritems():
                # NOTE(review): value codes stored as unicode strings here,
                # unlike the int codes used by the later variants.
                values = {'text': {'main': unicode(text)},
                          'value': unicode(int(value))}
                meta['columns'][column]['values'].append(values)
        else:
            if column in metadata.formats:
                # A '.' in the SPSS format implies decimals -> float.
                f = metadata.formats[column]
                if '.' in f:
                    meta['columns'][column]['type'] = "float"
                else:
                    meta['columns'][column]['type'] = "int"
            else:
                # Infer meta from data
                if data is not None:
                    # print "VAR '{}' NOT IN value_labels".format(column)
                    column_values = data[column].dropna()
                    if len(column_values) > 0:
                        # Get the first "not nan" value from the column
                        value = column_values.values[0]
                        if isinstance(value, pd.np.float64):
                            # Float AND Int because savReaderWriter loads them both as float64
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "float"
                            # All values whole numbers -> downcast to int.
                            if (data[column].dropna() % 1).sum() == 0:
                                if (data[column].dropna() % 1).unique() == [0]:
                                    try:
                                        data[column] = data[column].astype('int')
                                    except:
                                        pass
                                    meta['columns'][column]['type'] = "int"
                        elif isinstance(value, unicode) or isinstance(value, str):
                            # Strings
                            meta['columns'][column]['text'] = {'main': [column]}
                            meta['columns'][column]['type'] = "string"

        # The following header sections are recognised but currently unused.
        if column in metadata.varTypes:
            pass
        if column in metadata.varSets:
            pass
        if column in metadata.varAttributes:
            pass
        if column in metadata.varRoles:
            pass
        if column in metadata.measureLevels:
            pass

        # Some labels are empty strings.
        if column in metadata.varLabels:
            meta['columns'][column]['text'] = {'main': metadata.varLabels[column]}

    for mrset in metadata.multRespDefs:
        # meta['masks'][mrset] = {}
        # 'D' is "multiple dichotomy sets" in SPSS
        # 'C' is "multiple category sets" in SPSS
        if metadata.multRespDefs[mrset]['setType'] == 'C':
            # No-op placeholder: 'C' sets are not converted by this variant.
            'C'
            # meta['masks'][mrset]['type'] = "categorical set"
        elif metadata.multRespDefs[mrset]['setType'] == 'D':
            # No-op placeholder string kept from the original structure.
            'D'
            # meta['masks'][mrset]['type'] = "dichotomous set"
            # meta['masks'][mrset]['countedValue'] = metadata.multRespDefs[mrset]['countedValue']
            varNames = metadata.multRespDefs[mrset]['varNames']
            # meta, data[mrset] = delimited_from_dichotomous(meta, data[varNames], mrset)
            # Append the condensed delimited set as a new column.
            data[mrset] = condense_dichotomous_set(data[varNames],
                                                   values_from_labels=False)
            meta['columns'][mrset] = {
                'type': 'delimited set',
                'text': {'main': metadata.multRespDefs[mrset]['label']},
                'values': [
                    {
                        'text': {'main': metadata.varLabels[varName]},
                        'value': v
                    }
                    for v, varName in enumerate(varNames, start=1)
                ]
            }
            # Replace the run of source-column items with the single new
            # set item in the 'data file' set.
            idx = meta['sets']['data file']['items'].index('columns@%s' % (varNames[0]))
            items = meta['sets']['data file']['items']
            meta['sets']['data file']['items'] = (
                items[:idx] + ['columns@%s' % (mrset)] + items[idx+len(varNames):])
            # meta['masks'][mrset]['text'] = [metadata.multRespDefs[mrset]['label']]
            # meta['masks'][mrset]['items'] = []
            # for var_name in metadata.multRespDefs[mrset]['varNames']:
            #     meta['masks'][mrset]['items'].append({'source':"columns@{0}".format(var_name)})
            # df = make_delimited_from_dichotmous(data[common_vars[var]])

    return meta, data
def quantipy_from_ascribe(path_xml, path_txt, text_key='main'):
    """
    Convert an AScribe coded-questions export into a Quantipy
    (meta, data) pair.

    Parameters
    ----------
    path_xml : str
        Path to the AScribe XML document describing the coded questions.
    path_txt : str
        Path to the tab-delimited (UTF-16) AScribe data file.
    text_key : str, default 'main'
        Text key under which labels are stored in the resulting meta.

    Returns
    -------
    meta : dict
        The generated Quantipy meta document.
    data : pandas.DataFrame
        Only the converted coded columns, one per MultiForm variable.
    """
    # Read the AScribe meta (XML). Use a context manager so the file
    # handle is closed deterministically (the previous code leaked it
    # via xmltodict.parse(open(path_xml))).
    with open(path_xml) as xml_file:
        meta_ascribe = xmltodict.parse(xml_file.read())
    # Read the AScribe data (tab-delimited). read_csv with index_col=0
    # and parse_dates=True reproduces the defaults of the deprecated
    # (and since-removed) DataFrame.from_csv.
    data_ascribe = pd.read_csv(
        path_txt,
        sep='\t',
        header=0,
        encoding='utf-16',
        index_col=0,
        parse_dates=True
    )

    # Start a Quantipy meta document
    meta = start_meta(text_key=text_key)
    meta['columns']['responseid'] = {
        'type': 'int',
        'text': {text_key: 'responseid'}
    }

    # Container to record the names, in order, of the resulting
    # coded columns
    coded_names = []
    for var in meta_ascribe['CodedQuestions']['MultiForm']:
        name = var['Name']
        coded_names.append(name)
        var_text = var['FormTexts']['FormText']['Text']
        if var_text is None:
            var_text = 'Label not provided'
        var_text = {text_key: var_text}
        columns = []
        values = []
        for val in var['Answers']['Answer']:
            value = int(val['@Precode'])
            if value == 0:
                # 0 is reserved for "no response" in delimited sets
                msg = (
                    "The value 0 has been assigned to a code for the "
                    "variable '%s'."
                ) % (name)
                warnings.warn(msg)
            val_text = val['Texts']['Text']['#text']
            if val_text is None:
                val_text = 'Label not provided'
            val_text = {text_key: val_text}
            values.append({'value': value, 'text': val_text})
            columns.append('%s_%s' % (name, value))

        # Create a single series from the dichotomous set
        data_ascribe[name] = condense_dichotomous_set(
            data_ascribe[columns],
            sniff_single=True
        )

        # Determine the Quantipy type of the returned
        # series from its dtype (see 'sniff_single' in
        # condense_dichotomous_set())
        if data_ascribe[columns].sum(axis=1).max() == 1:
            col_type = 'single'
        else:
            col_type = 'delimited set'

        # Create the new Quantipy column meta
        column = {
            'type': col_type,
            'text': var_text,
            'values': values
        }

        # Add the newly defined column to the Quantipy meta
        meta['columns'][name] = column

    meta['sets']['data file']['items'] = [
        'columns@%s' % (col_name) for col_name in coded_names
    ]

    # Keep only the slice that has been converted.
    data = data_ascribe[coded_names]

    return meta, data
def quantipy_from_ascribe(path_xml, path_txt, text_key='main'):
    """
    Build a Quantipy (meta, data) pair from an AScribe export.

    Parameters
    ----------
    path_xml : str
        Path to the AScribe XML describing the coded questions.
    path_txt : str
        Path to the tab-delimited (UTF-16) AScribe data file.
    text_key : str, default 'main'
        Text key used for all labels written into the meta.

    Returns
    -------
    meta : dict
        The generated Quantipy meta document.
    data : pandas.DataFrame
        Only the converted coded columns, one per MultiForm variable.
    """
    # Parse the AScribe meta (XML) and data (tab-delimited)
    meta_ascribe = xmltodict.parse(open(path_xml))
    data_ascribe = pd.DataFrame.from_csv(path_txt, sep='\t', header=0,
                                         encoding='utf-16')

    # Fresh Quantipy meta document with the id column pre-registered
    meta = start_meta(text_key=text_key)
    meta['columns']['responseid'] = {
        'type': 'int',
        'text': {text_key: 'responseid'}}

    # Names of the resulting coded columns, in encounter order
    coded_names = []
    for var in meta_ascribe['CodedQuestions']['MultiForm']:
        name = var['Name']
        coded_names.append(name)
        coded_from = var['FormTexts']['FormText']['Title']
        label = var['FormTexts']['FormText']['Text']
        if label is None:
            label = 'Label not provided'
        var_text = {text_key: label}

        columns = []
        values = []
        for answer in var['Answers']['Answer']:
            code = int(answer['@Precode'])
            if code == 0:
                # 0 clashes with the "no response" convention
                warnings.warn(
                    ("The value 0 has been assigned to a code for the "
                     "variable '%s'.") % (name))
            answer_label = answer['Texts']['Text']['#text']
            if answer_label is None:
                answer_label = 'Label not provided'
            values.append({'value': code, 'text': {text_key: answer_label}})
            columns.append('%s_%s' % (name, code))

        # Collapse the dichotomous columns into one series
        data_ascribe[name] = condense_dichotomous_set(
            data_ascribe[columns], sniff_single=True)

        # A set whose rows never hold more than one response is 'single'
        # (see 'sniff_single' in condense_dichotomous_set())
        col_type = ('single'
                    if data_ascribe[columns].sum(axis=1).max() == 1
                    else 'delimited set')

        # Register the new column in the meta
        meta['columns'][name] = {
            'type': col_type,
            'text': var_text,
            'values': values}

    meta['sets']['data file']['items'] = [
        'columns@%s' % (col_name) for col_name in coded_names]

    # Return only the converted slice
    return meta, data_ascribe[coded_names]
def extract_sav_meta(sav_file, name="", data=None, ioLocale='en_US.UTF-8',
                     ioUtf8=True, dichot=None, dates_as_strings=False,
                     text_key="en-GB", engine='savReaderWriter'):
    """
    Build a Quantipy meta document from an SPSS .sav file, selecting one
    of two reader engines.

    With ``engine='readstat'`` only the header is read (pyreadstat,
    metadataonly) and types are derived from value labels and original
    variable formats. With ``engine='savReaderWriter'`` the full
    savReaderWriter header is used and SPSS multiple-response sets
    (both 'C' category sets and 'D' dichotomy sets) are converted into
    Quantipy delimited sets, replacing their source columns in ``data``.

    NOTE(review): ``pd.np`` (removed in pandas >= 2) and ``np.NaN``
    (removed in numpy >= 2) are used below — confirm pinned versions.

    Parameters
    ----------
    sav_file : str
        Path of the .sav file to read.
    name : str
        Name echoed into the meta's info text.
    data : pandas.DataFrame
        Case data already read from the same file; mutated in place and
        re-bound when sets are condensed.
    ioLocale : str
        Locale; the part after '.' is also used as the pyreadstat encoding.
    ioUtf8 : bool
        Unicode mode handed through to savReaderWriter.
    dichot : dict, default {'yes': 1, 'no': 0}
        Counted/not-counted value mapping for dichotomous sets.
    dates_as_strings : bool
        If True, DATETIME columns are typed 'string' instead of 'date'.
    text_key : str
        Text key under which labels are stored in the meta.
    engine : {'readstat', 'savReaderWriter'}
        Which SAV reader backend to use.

    Returns
    -------
    meta : dict
    data : pandas.DataFrame
    """
    if engine == 'readstat':
        # Header-only read; df is unused since metadataonly=True.
        df, metadata = pyreadstat.read_sav(sav_file,
                                           encoding=ioLocale.split(".")[-1],
                                           metadataonly=True)
        meta = start_meta(text_key=text_key)
        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName) for varName in metadata.column_names
        ]

        for index, column in enumerate(metadata.column_names):
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.variable_value_labels:
                # Labelled values -> categorical 'single' column.
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.variable_value_labels[
                        column].items():
                    values = {
                        'text': {
                            text_key: str(text)
                        },
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
                # if user has stored single answer data as a string rather than number
                # we convert it to floats and store non convertables as nan (with coerce)
                if column in data.columns and data[column].dtype == 'O':
                    data[column] = pd.to_numeric(data[column],
                                                 errors='coerce',
                                                 downcast='float')
            else:
                if column in metadata.original_variable_types:
                    # Derive the type from the SPSS print format string.
                    f = metadata.original_variable_types[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's an
                            # as-yet undetermined discrepancy between the
                            # input and output dates if datetime64 is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        # 'A…' formats are SPSS string variables.
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"
            # add the variable label to the meta
            meta['columns'][column]['text'] = {
                text_key: metadata.column_labels[index]
            }
        return meta, data
    elif engine == 'savReaderWriter':
        if dichot is None: dichot = {'yes': 1, 'no': 0}
        """ see parse_sav_file doc """
        with sr.SavHeaderReader(sav_file, ioLocale=ioLocale,
                                ioUtf8=ioUtf8) as header:
            # Metadata Attributes
            # ['valueLabels', 'varTypes', 'varSets', 'varAttributes', 'varRoles',
            # 'measureLevels', 'caseWeightVar', 'varNames', 'varLabels', 'formats',
            # 'multRespDefs', 'columnWidths', 'fileAttributes', 'alignments',
            # 'fileLabel', 'missingValues']
            metadata = header.dataDictionary(True)

        meta = start_meta(text_key=text_key)
        meta['info']['text'] = 'Converted from SAV file {}.'.format(name)
        meta['info']['from_source'] = {'pandas_reader': 'sav'}
        meta['sets']['data file']['items'] = [
            'columns@{}'.format(varName) for varName in metadata.varNames
        ]

        # This should probably be somewhere in the metadata
        # weight_variable_name = metadata.caseWeightVar

        # Descriptions of attributes in metadata are are located here :
        # http://pythonhosted.org/savReaderWriter/#savwriter-write-spss-system-files
        for column in metadata.varNames:
            meta['columns'][column] = {}
            meta['columns'][column]['name'] = column
            meta['columns'][column]['parent'] = {}
            if column in metadata.valueLabels:
                # ValueLabels is type = 'single' (possibry 1-1 map)
                meta['columns'][column]['values'] = []
                meta['columns'][column]['type'] = "single"
                for value, text in metadata.valueLabels[column].items():
                    values = {
                        'text': {
                            text_key: str(text)
                        },
                        'value': int(value)
                    }
                    meta['columns'][column]['values'].append(values)
            else:
                if column in metadata.formats:
                    f = metadata.formats[column]
                    if 'DATETIME' in f:
                        if dates_as_strings:
                            # DATETIME fields from SPSS are currently
                            # being read in as strings because there's an
                            # as-yet undetermined discrepancy between the
                            # input and output dates if datetime64 is used
                            meta['columns'][column]['type'] = 'string'
                        else:
                            meta['columns'][column]['type'] = 'date'
                            data[column] = pd.to_datetime(data[column])
                    elif f.startswith('A'):
                        meta['columns'][column]['type'] = 'string'
                    elif '.' in f:
                        meta['columns'][column]['type'] = "float"
                    else:
                        meta['columns'][column]['type'] = "int"
                else:
                    # Infer meta from data
                    if data is not None:
                        # print "VAR '{}' NOT IN value_labels".format(column)
                        column_values = data[column].dropna()
                        if len(column_values) > 0:
                            # Get the first "not nan" value from the column
                            value = column_values.values[0]
                            if isinstance(value, pd.np.float64):
                                # Float AND Int because savReaderWriter loads them both as float64
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "float"
                                # All values whole numbers -> downcast to int.
                                # NOTE(review): ``.unique() == [0]`` compares
                                # an ndarray to a list — confirm semantics.
                                if (data[column].dropna() % 1).sum() == 0:
                                    if (data[column].dropna() %
                                            1).unique() == [0]:
                                        try:
                                            data[column] = data[column].astype(
                                                'int')
                                        except:
                                            pass
                                        meta['columns'][column]['type'] = "int"
                            elif isinstance(value, str) or isinstance(
                                    value, str):
                                # Strings
                                # NOTE(review): both isinstance checks test
                                # str — likely a leftover from a unicode/str
                                # Python 2 check; the second is redundant.
                                meta['columns'][column]['text'] = {
                                    text_key: [column]
                                }
                                meta['columns'][column]['type'] = "string"

            # The following header sections are recognised but currently
            # unused.
            if column in metadata.varTypes:
                pass
            if column in metadata.varSets:
                pass
            if column in metadata.varAttributes:
                pass
            if column in metadata.varRoles:
                pass
            if column in metadata.measureLevels:
                pass

            # Some labels are empty strings.
            if column in metadata.varLabels:
                meta['columns'][column]['text'] = {
                    text_key: metadata.varLabels[column]
                }

        for mrset in metadata.multRespDefs:
            # meta['masks'][mrset] = {}
            # 'D' is "multiple dichotomy sets" in SPSS
            # 'C' is "multiple category sets" in SPSS
            varNames = list(metadata.multRespDefs[mrset]['varNames'])
            # Find the index where there delimited set should be inserted
            # into data, which is immediately prior to the start of the
            # dichotomous set columns
            dls_idx = data.columns.tolist().index(varNames[0])
            if metadata.multRespDefs[mrset]['setType'] == 'C':
                # Raise if value object of columns is not equal
                if not all(meta['columns'][v]['values'] == meta['columns'][
                        varNames[0]]['values'] for v in varNames):
                    msg = 'Columns must have equal values to be combined in a set: {}'
                    raise ValueError(msg.format(varNames))
                # Concatenate columns to set: join the per-column codes
                # with ';', dropping missings; a row with no responses
                # collapses to ';' and is then replaced with NaN.
                df_str = data[varNames].astype('str')
                dls = df_str.apply(lambda x: ';'.join([
                    v.replace('.0', '') for v in x.tolist()
                    if not v in ['nan', 'None']
                ]), axis=1) + ';'
                dls.replace({';': np.NaN}, inplace=True)
                # Get value object
                values = meta['columns'][varNames[0]]['values']
            elif metadata.multRespDefs[mrset]['setType'] == 'D':
                # Generate the delimited set from the dichotomous set
                dls = condense_dichotomous_set(data[varNames],
                                               values_from_labels=False,
                                               **dichot)
                # Get value object
                values = [{
                    'text': {
                        text_key: metadata.varLabels[varName]
                    },
                    'value': int(v)
                } for v, varName in enumerate(varNames, start=1)]
            else:
                # Unknown set type: leave untouched.
                continue
            # Insert the delimited set into data
            data.insert(dls_idx, mrset, dls)
            # Generate the column meta for the new delimited set
            meta['columns'][mrset] = {
                'name': mrset,
                'type': 'delimited set',
                'text': {
                    text_key: metadata.multRespDefs[mrset]['label']
                },
                'parent': {},
                'values': values
            }
            # Add the new delimited set to the 'data file' set
            df_items = meta['sets']['data file']['items']
            df_items.insert(df_items.index('columns@{}'.format(varNames[0])),
                            'columns@{}'.format(mrset))
            # Drop the source columns from data, the set and the column
            # meta — they are fully represented by mrset now.
            data = data.drop(varNames, axis=1)
            for varName in varNames:
                df_items.remove('columns@{}'.format(varName))
                del meta['columns'][varName]

        return meta, data