def getmetadata(variables, missing):
    """Return two dicts keyed by variable name: labelled values and missing values.

    variables is a list of variable names to process.
    missing specifies the missing value treatment; when it is "exclude",
    user-missing values are removed from each variable's labelled-value set.
    Range specifications for missing values are silently ignored.
    """
    labelsets = {}
    missingsets = {}
    with spss.DataStep():
        ds = spss.Dataset()
        for name in variables:
            varobj = ds.varlist[name]
            labelsets[name] = set(varobj.valueLabels.data.keys())
            mvs = varobj.missingValues
            # A negative first element signals a range specification; ranges
            # are ignored and only the discrete value in slot 3 is kept.
            mvs = [mvs[3]] if mvs[0] < 0 else mvs[1:]
            missingsets[name] = {item for item in mvs if item is not None}
            if missing == "exclude":
                labelsets[name] -= missingsets[name]
    return labelsets, missingsets
def __init__(self):
    """Start a data step and cache the active dataset, its variable list,
    and its multiple response sets keyed by upper-cased set name."""
    ##self.vardict = spssaux.VariableDict()
    spss.StartDataStep()
    self.ds = spss.Dataset()
    self.varlist = self.ds.varlist
    self.mrsets = {}
    # the api always returns the set name in upper case
    # BUG FIX: dict.iteritems() does not exist in Python 3; the sibling
    # implementation of this class already uses items().
    for name, theset in self.ds.multiResponseSet.data.items():
        self.mrsets[name.upper()] = theset
def __init__(self):
    """Start a data step (running pending transformations first if that is
    what blocks it), then cache the active dataset, its variable list, and
    its multiple response sets keyed by upper-cased set name."""
    try:
        spss.StartDataStep()
    except:
        # A data step cannot start while transformations are pending.
        spss.Submit("EXECUTE.")
        spss.StartDataStep()
    self.ds = spss.Dataset()
    self.varlist = self.ds.varlist
    # the api always returns the set name in upper case
    self.mrsets = {name.upper(): theset
                   for name, theset in self.ds.multiResponseSet.data.items()}
def addinfo(filespec):
    """open the file if appropriate type, extract variable information,
    and add it to dataset dsname.

    filespec is the file to open
    dsname is the dataset name to append to
    filetypes is the list of file types to include.

    NOTE(review): this function reads many names from an enclosing scope
    (filetypes, ftdict, pat, spsscmd, dsname, blanks, includeAttrs,
    attrindexes, attrlength, encoding) -- it appears to be the closure
    returned by a makeaddinfo factory; confirm against the full file."""
    fnsplit = os.path.split(filespec)[1]
    fn, ext = os.path.splitext(fnsplit)
    # Find the first selected file type whose extension list matches.
    for ft in filetypes:
        if ext in ftdict[ft]:
            if pat is None or pat.match(fn):
                try:
                    # Open with the type-specific GET command, then give the
                    # opened file a well-known scratch dataset name.
                    spss.Submit(spsscmd[ft] % filespec)
                    spss.Submit("DATASET NAME @__GATHERMD__.")
                except:
                    if not isinstance(filespec, str):
                        filespec = str(filespec, encoding)
                    raise EnvironmentError(_("File could not be opened, skipping: %s") % filespec)
                break
    else:
        # No selected file type (or the name pattern) matched: skip silently.
        return addinfo
    with DataStep():
        ds = spss.Dataset(name=dsname)   # not the active dataset
        dssource = spss.Dataset(name="*")   # The dataset to examine
        numvars = spss.GetVariableCount()   # active dataset
        variables = dssource.varlist
        for v in range(numvars):
            # One output case per variable: source file, name, label,
            # plus one blank slot per requested attribute.
            lis = [filespec.replace("\\", "/"), spss.GetVariableName(v), spss.GetVariableLabel(v)]
            lis.extend(blanks)
            # Pad every value; string cells must be long enough for the target.
            lis = [item + 256*" " for item in lis]
            ds.cases.append(lis)
            #ds.cases.append([filespec.replace("\\","/"), spss.GetVariableName(v), spss.GetVariableLabel(v), *blanks])
            if includeAttrs:
                attrs = variables[v].attributes.data
                for a in attrs:
                    if a.lower() in attrindexes:
                        # Only the first element of an array attribute is kept.
                        ds.cases[-1, attrindexes[a.lower()] + 3] = attrs[a][0] + attrlength * " "   # allow for standard variables
    spss.Submit("DATASET CLOSE @__GATHERMD__.")
def getvalues(num, denom, id, dsname):
    """Return vectors of num, denom, and id values from constants in syntax
    or variable values.

    Each of num and denom is a list holding either numeric constants or a
    single variable name; id is a single value or variable name (or None).
    Lists of length 1 are broadcast to the longest list's length."""
    # A data step is needed only if any argument names a dataset variable.
    if isname(num[0]) or isname(denom[0]) or isname(id):
        spss.StartDataStep()
        ds = spss.Dataset(dsname)
    else:
        ds = None
    id = [id]
    try:
        vallist = []
        if ds:
            vl = [v.name.lower() for v in ds.varlist]   # variables in the dataset
        for v in num, denom, id:
            try:
                # Constant case: every item converts to float.
                vallist.append([float(val) for val in v])
            except:
                # variable name argument or None
                if v[0] is None:   # can only happen with id variable
                    vallist.append([None])   # null label in case no id variable
                else:
                    if len(v) > 1:
                        raise ValueError(
                            "Error: Only one variable may be named on each of NUM, DENOM, and ID, and a variable may not be combined with a value: " + " ".join(v))
                    try:
                        # Pull the named variable's column from the dataset.
                        vindex = vl.index(v[0].lower())
                        vallist.append([val[vindex] for val in ds.cases])
                    except:
                        raise ValueError(
                            "Error: An undefined variable name was specified in NUM, DENOM, or ID: " + " ".join(v))
    finally:
        # NOTE(review): EndDataStep is called even when no data step was
        # started (ds is None) -- presumably harmless in the spss api; confirm.
        spss.EndDataStep()
    # check and fix value list lengths: broadcast singletons, reject mismatches
    maxlen = max([len(vl) for vl in vallist])
    for i in range(len(vallist)):
        if len(vallist[i]) == 1:
            vallist[i] = maxlen * vallist[i]
        if len(vallist[i]) != maxlen:
            raise ValueError(
                "Error: NUM, DENOM and optional ID do not all have the same number of items"
            )
    return vallist
def genSetsCategoryList(mcset, allvars, resolver, setname, varprefix):
    """Generate sorted list(s) of values with possible insertion of extra values
    and create SPSS macros.

    mcset is the mc set to convert
    allvars is the resolved list of variables in the sets
    resolver is a class that contains the MR set information from the SPSS dictionary.
    setname is the name for the output set
    varprefix is the prefix for variable names to generate

    Returns a tuple (generatednames, generatedvalues, generatedlabels)
    produced by ManageValues."""
    if resolver.getSetType(mcset) != "Categories":
        raise ValueError(_("""The specified set is not a multiple category set. Only a set of that type can be used in this procedure: %s""") % mcset)
    curs = spssdata.Spssdata(indexes=allvars, names=False)   # keep cases w missing, mv's set to None
    nvar = len(allvars)
    vvalues = set()
    # Collect the distinct non-missing values across all set variables.
    for case in curs:
        for i in range(nvar):
            if not case[i] is None:   # omit sysmis and user missing values
                if resolver.getVarType(mcset) == "String":
                    val = case[i].rstrip()   # trailing blanks are not significant
                else:
                    val = case[i]
                vvalues.add(val)
    curs.CClose()
    if len(vvalues) == 0:
        raise ValueError(_("""There are no values in the set variables for set: %s""" % mcset))
    # copy values labels from the first variable in the set
    # MC sets are expected to have consistent value labels across variables
    # if any are defined.
    with spss.DataStep():
        valuelabels = spss.Dataset().varlist[allvars[0]].valueLabels.data
    manager = ManageValues(resolver, mcset, vvalues, setname, varprefix, valuelabels)
    manager.genData()
    manager.setgen()
    return (manager.generatednames, manager.generatedvalues, manager.generatedlabels)
def metadata(datain, path):
    """Write a ".met" metadata description for an SPSS .sav file.

    datain is the data file to describe; ".sav" is appended when absent.
    path is the output path; ".met" is appended to form the metadata file.
    Returns 0 on completion.
    """
    # Use a context manager so the output file is closed even on error.
    with open(path + ".met", "w") as f:
        f.write("standard;\n")
        f.write("variables\n")   # write the variable metadata
        # BUG FIX: re.match anchors at the START of the string, so the original
        # test r"\.sav$" could never match a normal file name and ".sav" was
        # appended unconditionally; re.search correctly tests the suffix.
        if not re.search(r"\.sav$", datain, flags=re.IGNORECASE):
            datain += ".sav"   # add .sav
        spss.Submit("get file='{0}'.".format(datain))
        spss.StartDataStep()
        ds = spss.Dataset()
        lasttype = -1   # renamed from "type" to avoid shadowing the builtin
        frames = 0
        for var in ds.varlist:
            line = " name={0}".format(var.name)   # name
            if var.label:
                # NOTE(review): replace('"', '"') is a no-op; it was probably
                # meant to escape embedded quotes -- confirm intended escaping
                # before changing the output format.
                line += ' label="{0}"'.format(var.label.replace('"', '"'))   # label
            if var.type != lasttype:
                if var.type == 0:   # type and width (0 means numeric)
                    line += " type=float width=8"
                else:
                    line += " type=char width={0}".format(var.type)
                lasttype = var.type
            if var.valueLabels:   # codeframe
                line += ' codeframe="{0}"'.format(var.name)
                frames = 1
            line += ";\n"
            f.write(line)
        if frames:   # write the codeframe metadata
            f.write("codeframes\n")
            for var in ds.varlist:
                if var.valueLabels:
                    f.write(" name={0}\n".format(var.name))
                    # BUG FIX: dict.iteritems() does not exist in Python 3.
                    for val, lab in var.valueLabels.data.items():
                        f.write(' {0} = "{1}"\n'.format(val, lab))
                    f.write(" ;\n")
        ds.close()
        spss.EndDataStep()
    return 0
def anon(varnames, nameroot=None, svalueroot='', method='sequential',
         seed=None, offset=None, scale=None, maxrvalue=None, onetoone=None,
         namemapping=None, valuemapping=None, mapping=None, ignorethis=None):
    """Anonymize the specified variables.

    varnames is the list of input variables.
    nameroot, if specified, is used as a prefix to rename variables with a
    numerical suffix.
    svalueroot, if specified, gives a prefix to be prepended to transformed
    values of string variables.
    method = 'sequential' (default), 'random', or 'transform'.
    seed, if specified, is used to initialize the random number generator.
    offset and scale, required if method=transform, are the parameters for a
    linear transform of the values.  If specified for a string variable,
    sequential is substituted.  System-missing values are left as sysmis.
    maxrvalue is the maximum value for the random method.  Must be positive.
    Only applies to random method.  Can be one value for all variables or a
    list the size of the variable list with variable-specific values.
    onetoone is an optional list of variable names, a subset of varnames, for
    which mapped values must be unique.  Applies only to method random.  If a
    1-1 mapping cannot be found, an exception is raised.
    namemapping and valuemapping determine whether files with tables of
    results are saved.
    mapping names a file written as valuemapping to be used to initialize
    random mappings.
    """
    with DataStep():
        ds = spss.Dataset()
        allvariables = ds.varlist
        varnums = [allvariables[v].index for v in varnames]
        numvars = len(varnums)
        if maxrvalue is None:
            maxrvalue = [9999999]
        if len(maxrvalue) == 1:
            maxrvalue = numvars * maxrvalue   # broadcast one value to all variables
        if len(maxrvalue) != numvars:
            raise ValueError(
                "The number of values for maxrvalue is different from the number of variables"
            )
        if onetoone is None:
            onetoone = []
        onetoone = set([allvariables[v].index for v in onetoone])
        if not onetoone.issubset(set(varnums)):
            raise ValueError(
                "A variable is listed in ONETOONE that is not in the VARIABLES list"
            )
        if seed:
            random.seed(seed)
        # One transformer object per variable.
        trflist = [
            Tvar(allvariables[vn], svalueroot, method, offset, scale,
                 maxrvalue[i], vn in onetoone)
            for i, vn in enumerate(varnums)
        ]
        mapinputs(trflist, mapping)   # initialize mappings if input mapping given
        todo = list(zip(varnums, trflist))
        # Rewrite every case in place with the anonymized values.
        for i, case in enumerate(ds.cases):
            for vnum, t in todo:
                ds.cases[i, vnum] = t.trf(case[vnum])
        # remove now irrelevant value labels and missing value codes
        for vn in varnums:
            allvariables[vn].valueLabels = {}
            allvariables[vn].missingValues = (0, None, None, None)
        # rename variables if requested
        # first find a number that guarantees no name conflicts.
        if nameroot:
            basenum = 0
            pat = re.compile(r"%s(\d+)$" % nameroot, re.IGNORECASE)
            for v in allvariables:
                try:
                    vnum = re.match(pat, v.name).group(1)
                    basenum = max(basenum, int(vnum))
                except:
                    pass
            basenum += 1
            if namemapping:
                f = codecs.open(namemapping, "w", encoding="utf_8_sig")
            for vn in varnums:
                newname = nameroot + str(basenum)
                if len(newname) > 64:
                    raise ValueError(
                        "A replacement variable name is too long: %s" % newname)
                if namemapping:
                    f.write("%s = %s%s" % (allvariables[vn].name, newname, lineend))
                allvariables[vn].name = newname
                basenum += 1
            if namemapping:
                f.close()
                print("Variable name mappings written to file: %s" % namemapping)
        ds.close()
    # write file of value mappings for each mapped variable in csv format
    if valuemapping:
        # BUG FIX: file() is a Python-2 builtin removed in Python 3, and
        # UnicodeWriter is a Python-2 workaround; use codecs.open with the
        # standard csv writer as the commented-out code intended.
        import csv
        f = codecs.open(valuemapping, "w", encoding="utf_8_sig")
        csvout = csv.writer(f)
        for t in trflist:
            t.write(csvout)
        f.close()
        print("Value mappings written to file: %s" % valuemapping)
def PCA(StandardizedPCAInput, varList, regionId):
    """Use the SPSS python api to perform PCA.

    Arguments:
        StandardizedPCAInput - 2d python list for PCA input
        varList - a list of variables for each column in the PCA input
        regionId - region identifier (used for warnings and debug output)

    Returns:
        CorrelationMatrix - Correlation matrix
        NonpositiveDefiniteCorM - True if the KMO/Bartlett table was absent
        KMO - Kaiser-Mayer-Olkin value
        Bartlett_sig - Significance value of Bartlett's Sphericity Test
        Communalities - Communalities of extracted components
        VarExplainedInfo - Variance explained from unrotated solution
            (absolute variance, % of variance, cumulative %)
        RotatedVarExplainedInfo - Variance explained from rotated solution
            (absolute variance, % of variance, cumulative %)
        ComponentMatrix - Unrotated component loading matrix
        RotatedComponentMatrix - Rotated component loading matrix
        ComponentScoreCoefficientMatrix - Component score coefficients from
            the rotated solution
        ComponentScore - Component scores derived from score coefficients
    """
    # SPSS command & dataset setup
    spss.Submit("NEW FILE")
    with spss.DataStep():
        datasetObj = spss.Dataset()
        for var in varList:
            datasetObj.varlist.append(var)
        for row in StandardizedPCAInput:
            datasetObj.cases.append(row)
    if regionId == 18:
        debugFileOutputDir = r'C:\Users\hxiong\Dropbox\Haoyi Vulnerability\Simulation\Hurricane_Sandy'
        # BUG FIX: np.savetxt takes the file name FIRST and the array second,
        # and the original "%s\PCAInut_r%d" % regionId supplied one argument
        # for two placeholders (TypeError); supply both values as a tuple.
        np.savetxt(r'%s\PCAInut_r%d' % (debugFileOutputDir, regionId),
                   np.array(StandardizedPCAInput), fmt='%.7f')
    spssPCASyntax = """FACTOR
  /VARIABLES {0}
  /MISSING LISTWISE
  /ANALYSIS {0}
  /PRINT UNIVARIATE INITIAL CORRELATION KMO EXTRACTION ROTATION FSCORE
  /CRITERIA MINEIGEN(1) ITERATE(25)
  /EXTRACTION PC
  /CRITERIA ITERATE(100)
  /ROTATION VARIMAX
  /SAVE REG(ALL)
  /METHOD=CORRELATION.""".format(' '.join(varList))
    spss.SetOutput("off")
    varNum = len(varList)
    # Create XML output from SPSS
    tag = spssaux.CreateXMLOutput(spssPCASyntax, omsid='Factor Analysis')
    # Get correlation matrix
    CorrelationMatrix = spssaux.getValuesFromXmlWorkspace(tag, 'Correlation Matrix',
                                                          cellAttrib="number")
    CorrelationMatrix = _spssOutputTableConversion(CorrelationMatrix, varNum, varNum)
    # Get KMO and Bartlett test sig.
    KMO_and_Bartlett = spssaux.getValuesFromXmlWorkspace(
        tag, 'KMO and Bartlett Test', cellAttrib="number")
    KMO_and_Bartlett = _spssOutputTableConversion(KMO_and_Bartlett, 1)
    NonpositiveDefiniteCorM = False
    KMO = 0.
    Bartlett_sig = 0.
    # An empty table signals a non-positive-definite correlation matrix.
    if len(KMO_and_Bartlett) == 0:
        NonpositiveDefiniteCorM = True
    else:
        KMO = KMO_and_Bartlett[0]
        Bartlett_sig = KMO_and_Bartlett[3]
    # Get Communalities
    Communalities = spssaux.getValuesFromXmlWorkspace(tag, 'Communalities',
                                                      colCategory="Extraction",
                                                      cellAttrib="number")
    Communalities = _spssOutputTableConversion(Communalities, 1)
    # Get variances explained in unrotated solution
    VarExplained = spss.EvaluateXPath(
        tag[0], "/outputTree",
        """//pivotTable//category[@text="Extraction Sums of Squared Loadings"]/dimension["Statistics"]/category[@text="Total"]/cell/@number"""
    )
    PctVarExplained = spss.EvaluateXPath(
        tag[0], "/outputTree",
        """//pivotTable//category[@text="Extraction Sums of Squared Loadings"]/dimension["Statistics"]/category[@text="% of Variance"]/cell/@number"""
    )
    CummulativePctVarExplained = spss.EvaluateXPath(
        tag[0], "/outputTree",
        """//pivotTable//category[@text="Extraction Sums of Squared Loadings"]/dimension["Statistics"]/category[@text="Cumulative %"]/cell/@number"""
    )
    VarExplained = _spssOutputTableConversion(VarExplained, 1)
    PctVarExplained = _spssOutputTableConversion(PctVarExplained, 1)
    CummulativePctVarExplained = _spssOutputTableConversion(
        CummulativePctVarExplained, 1)
    VarExplainedInfo = [
        VarExplained, PctVarExplained, CummulativePctVarExplained
    ]
    # Get variances explained in rotated solution
    RotatedVarExplained = spss.EvaluateXPath(
        tag[0], "/outputTree",
        """//pivotTable//category[@text="Rotation Sums of Squared Loadings"]/dimension["Statistics"]/category[@text="Total"]/cell/@number"""
    )
    RotatedPctVarExplained = spss.EvaluateXPath(
        tag[0], "/outputTree",
        """//pivotTable//category[@text="Rotation Sums of Squared Loadings"]/dimension["Statistics"]/category[@text="% of Variance"]/cell/@number"""
    )
    RotatedCummulativePctVarExplained = spss.EvaluateXPath(
        tag[0], "/outputTree",
        """//pivotTable//category[@text="Rotation Sums of Squared Loadings"]/dimension["Statistics"]/category[@text="Cumulative %"]/cell/@number"""
    )
    RotatedVarExplained = _spssOutputTableConversion(RotatedVarExplained, 1)
    RotatedPctVarExplained = _spssOutputTableConversion(
        RotatedPctVarExplained, 1)
    RotatedCummulativePctVarExplained = _spssOutputTableConversion(
        RotatedCummulativePctVarExplained, 1)
    RotatedVarExplainedInfo = [
        RotatedVarExplained, RotatedPctVarExplained,
        RotatedCummulativePctVarExplained
    ]
    # Get number of extracted components
    if len(VarExplained) != len(RotatedVarExplained):
        w = "Region %d: unrotated and rotated solution finds different number of component based on Kaiser Criterion." % regionId
        warnings.warn(w, RuntimeWarning)
    CompNum = len(VarExplained)
    # BUG FIX: xrange does not exist in Python 3; use range.
    ComponentScoreColumnIndex = [varNum + i for i in range(CompNum)]
    # Get component matrix
    ComponentMatrix = spssaux.getValuesFromXmlWorkspace(tag, 'Factor Matrix',
                                                        cellAttrib="number")
    ComponentMatrix = _spssOutputTableConversion(ComponentMatrix, CompNum, varNum)
    # Get rotated component matrix
    RotatedComponentMatrix = spssaux.getValuesFromXmlWorkspace(
        tag, 'Rotated Factor Matrix', cellAttrib="number")
    RotatedComponentMatrix = _spssOutputTableConversion(
        RotatedComponentMatrix, CompNum, varNum)
    # Get component score coefficient matrix
    ComponentScoreCoefficientMatrix = spssaux.getValuesFromXmlWorkspace(
        tag, 'Factor Score Coefficient Matrix', cellAttrib="number")
    ComponentScoreCoefficientMatrix = _spssOutputTableConversion(
        ComponentScoreCoefficientMatrix, CompNum, varNum)
    # Get component score (the SAVE REG(ALL) columns appended to the data)
    dataCursor = spss.Cursor(ComponentScoreColumnIndex)
    ComponentScore = dataCursor.fetchall()
    dataCursor.close()
    return (CorrelationMatrix, NonpositiveDefiniteCorM, KMO, Bartlett_sig,
            Communalities, VarExplainedInfo, RotatedVarExplainedInfo,
            ComponentMatrix, RotatedComponentMatrix,
            ComponentScoreCoefficientMatrix, ComponentScore)
def dopropor(num=None, denom=None, id=None, dsname="*", alpha=.05, adjust='bonferroni'):
    """Compute confidence intervals for proportions (binomial and Poisson)
    and differences from the first proportion, and display them as a pivot
    table via an SPSS procedure.

    num and denom are lists of counts or a single variable name each;
    id optionally labels the rows.  dsname is the source dataset name.
    NOTE(review): the adjust parameter is never used in this body -- confirm
    whether multiple-comparison adjustment was meant to be applied."""
    if num is None or denom is None:
        raise ValueError("Error: NUM and DENOM keywords are required")
    # NOTE(review): both branches assign str, so the UTF8-mode test is a
    # Python-2 leftover with no effect; harmless but dead.
    if spss.PyInvokeSpss.IsUTF8mode():
        unistr = str
    else:
        unistr = str
    currentds = spss.ActiveDataset()
    if currentds == "*":
        # Unnamed active dataset: give it a random scratch name so we can
        # reactivate it later.
        currentds = "S" + str(random.uniform(0, 1))
        spss.Submit("DATASET NAME %s" % currentds)
        dsnamed = True
    else:
        dsnamed = False
    numvec, denomvec, idvec = getvalues(num, denom, id, dsname)
    # clean data, discard missing
    droplist = []
    for i in range(len(numvec)):
        droplist.append(numvec[i] is not None and denomvec[i] is not None)   # missing data
        if (droplist[i] and (numvec[i] > denomvec[i] or denomvec[i] <= 0)):
            raise ValueError(
                "Error: NUM value greater than DENOM value or zero denominator: %s, %s"
                % (numvec[i], denomvec[i]))
    # NOTE(review): rebinding the loop variable lis does NOT modify numvec,
    # denomvec, or idvec -- this prune is a no-op, so missing entries are
    # never actually removed.  Confirm and fix deliberately.
    for lis in numvec, denomvec, idvec:
        lis = [x for f, x in zip(droplist, lis) if f]   # prune missing values
    if len(numvec) == 0:
        raise ValueError("Error: No valid proportions were found to analyze")
    alphalow = alpha / 2
    alphahigh = 1 - alphalow
    dotest = len(numvec) > 1
    try:
        spss.StartDataStep()   # TODO: pending transformations
    except:
        spss.Submit("EXECUTE")
        spss.StartDataStep()
    # calculate ci's via SPSS IDFs: build a scratch dataset of p/num/denom
    ds = spss.Dataset(name=None)
    spss.SetActive(ds)
    ds.varlist.append("p", 0)
    ds.varlist.append("num", 0)
    ds.varlist.append("denom", 0)
    p0 = numvec[0] / denomvec[0]
    sdvec = []
    for i in range(len(numvec)):
        p1 = numvec[i] / denomvec[i]
        # Standard error of the difference p1 - p0 (unpooled).
        sdvec.append(
            sqrt(p0 * (1 - p0) / denomvec[0] + p1 * (1 - p1) / denomvec[i]))
        #p = (numvec[i] + numvec[0]) / (denomvec[i] + denomvec[0])
        #z = (p1 - p0)/sqrt(p * (1 - p)*(1/denomvec[0] + 1/denomvec[i]))
        ds.cases.append([p1, numvec[i], denomvec[i]])
    spss.EndDataStep()
    # Jeffreys-style binomial interval and chi-square-based Poisson interval.
    cmd = r"""COMPUTE PLOWBI = IDF.BETA(%(alphalow)s, num + .5, denom-num + .5).
COMPUTE PHIGHBI = IDF.BETA(%(alphahigh)s, num + .5, denom - num + .5).
DO IF num > 0.
COMPUTE PLOWPOIS = (IDF.CHISQ(%(alphalow)s, 2*num)/2)/denom.
ELSE.
COMPUTE PLOWPOIS = 0.
END IF.
COMPUTE PHIGHPOIS = (IDF.CHISQ(%(alphahigh)s, 2*(num+1))/2) / denom.
COMPUTE ZTAIL = IDF.NORMAL(%(alphahigh)s, 0,1).
EXECUTE.""" % {"alphalow": alphalow, "alphahigh": alphahigh}
    spss.Submit(cmd)
    plowbi = []
    phighbi = []
    plowpois = []
    phighpois = []
    spss.StartDataStep()
    ds = spss.Dataset(name="*")
    # Read the computed interval bounds back out of the scratch dataset.
    for case in ds.cases:
        i = 3
        for v in plowbi, phighbi, plowpois, phighpois:
            v.append(case[i])
            i += 1
        zalpha2 = case[-1]   # ZTAIL: same on every case
    try:
        closeafter = False
        spss.SetActive(spss.Dataset(name=currentds))
    except:
        closeafter = True
    ds.close()
    spss.EndDataStep()
    from spss import CellText
    spss.StartProcedure("Proportions")
    table = spss.BasePivotTable("Proportion Confidence Intervals", "Proportions")
    titlefootnote = "Alpha = %.3f" % alpha
    if 0. in numvec:
        titlefootnote += " (One-sided %.3f when p = 0)" % (alpha / 2.)
    table.TitleFootnotes(titlefootnote)
    rowdim = table.Append(spss.Dimension.Place.row, "Proportions")
    coldim = table.Append(spss.Dimension.Place.column, "Statistics")
    cols = [
        "p", "Binomial\nLower CI", "Binomial\nUpper CI", "Poisson\nLower CI",
        "Poisson\nUpper CI", "Difference\nfrom p0",
        "Difference from p0\nLower CI", "Difference from p0\nUpper CI"
    ]
    table.SetCategories(coldim, [CellText.String(v) for v in cols])
    # Row labels: the id value when present, else the 1-based row number.
    idvec = [
        not v is None and unistr(v) or unistr(i + 1)
        for i, v in enumerate(idvec)
    ]
    table.SetCategories(rowdim, [CellText.String(v) for v in idvec])
    for i in range(len(numvec)):
        p1 = numvec[i] / denomvec[i]
        if i > 0:
            zdifflow = p1 - p0 - sdvec[i] * zalpha2
            zdiffhigh = p1 - p0 + sdvec[i] * zalpha2
        else:
            zdifflow = zdiffhigh = 0.
        table.SetCellsByRow(CellText.String(idvec[i]), [
            CellText.Number(v)
            for v in (numvec[i] / denomvec[i], plowbi[i], phighbi[i],
                      plowpois[i], phighpois[i], p1 - p0, zdifflow, zdiffhigh)
        ])
        # The first row is the baseline: difference columns are not meaningful.
        if i == 0:
            table[(CellText.String(idvec[0]),
                   CellText.String(cols[-3]))] = CellText.String("-")
            table[(CellText.String(idvec[0]),
                   CellText.String(cols[-2]))] = CellText.String("-")
            table[(CellText.String(idvec[0]),
                   CellText.String(cols[-1]))] = CellText.String("-")
    spss.EndProcedure()
    if closeafter:
        # NOTE(review): %% binds tighter than +, so the %s is filled with "S"
        # and the random suffix is appended AFTER the period -- the resulting
        # DATASET NAME command looks malformed; confirm intended name.
        spss.Submit(r"""NEW FILE.
DATASET NAME %s.""" % "S" + str(random.uniform(0, 1)))
# Script: open a .sav file and print its variables by index.
import spss, spssaux

print(spss.__version__)
spss.Submit(
    "get file='C:\\Users\\sam\\Desktop\\Data202201119\\20201119_1047.sav'.")
spss.StartDataStep()
myDataset = spss.Dataset()
myVarlist = myDataset.varlist
print(len(myVarlist))
print(myVarlist)
# PERF FIX: spssaux.GetVariableNamesList() was re-evaluated on every loop
# iteration; fetch the name list once before the loop.
varNames = spssaux.GetVariableNamesList()
for i in range(len(myVarlist)):
    print(str(i) + "--->" + varNames[i])
# BUG FIX: the data step was never closed, leaving the session in a
# data-step state; end it explicitly.
spss.EndDataStep()
def gather(files, filetypes=["spss"], filenamepattern=None, dsname=None, attrlist=[], attrlength=256):
    """Create SPSS dataset listing variable names, variable labels, and
    source files for selected files.  Return the name of the new dataset.

    files is a list of files and/or directories.  If an item is a file, it is
    processed; if it is a directory, the files and subdirectories it contains
    are processed.
    filetypes is a list of filetypes to process.  It defaults to ["spss"]
    which covers sav and por.  It can also include "sas" for sas7bdat, sd7,
    sd2, ssd01, and xpt, and "stata" for dta.
    filenamepattern is an optional parameter that can contain a regular
    expression to be applied to the filenames to filter the datasets that are
    processed.  It is applied to the filename itself, omitting any directory
    path and file extension.  The expression is anchored to the start of the
    name and ignores case.
    dsname is an optional name to be assigned to the new dataset.  If not
    specified, a name will be automatically generated.  If dsname is
    specified, it will become the active dataset; otherwise, it need not be
    the active dataset.
    attrlist is an optional list of custom attributes to be included in the
    output.  For array attributes, only the first item is recorded.  The
    value is blank if the attribute is not present for the variable.
    Attribute variables are strings of size attrlength bytes, truncated
    appropriately.

    The output is just a dataset.  It must be saved, if desired, after this
    function has completed.  Its name is the return value of this function.
    Exception is raised if any files not found.

    Examples:
    gathermetadata.gather(["c:/temp/firstlevel", "c:/spss16/samples/voter.sav"], ["spss", "sas"])
    searches spss and sas files in or under the temp/firstlevel directory
    plus the voter file.
    gathermetadata.gather(["c:/temp/firstlevel"], filenamepattern="car")
    searches the firstlevel directory for spss files whose names start with
    "car".

    NOTE(review): the mutable defaults filetypes=["spss"] and attrlist=[] are
    shared across calls; they are only read here, so this is benign, but
    confirm nothing downstream mutates them."""
    encoding = locale.getlocale()[1]
    filetypes = [f.lower() for f in filetypes]
    for ft in filetypes:
        if not ft in ["spss", "sas", "stata"]:
            raise ValueError(_("Filetypes must be one or more of spss, sas, and stata."))
    # Maps lower-cased logical names to the actual output variable names.
    dsvars = {"source": "source", "variablename": "VariableName", "variablelabel": "variableLabel"}
    with DataStep():
        # Build the (initially empty) output dataset and its variables.
        ds = spss.Dataset(name=None)
        dsn = ds.name
        varlist = ds.varlist
        varlist.append("source", 200)
        varlist["source"].label = _("File containing the variable")
        varlist.append("variableName", 64)
        varlist["variableName"].label = _("Variable Name")
        varlist.append("variableLabel", 256)
        varlist["variableLabel"].label = _("Variable Label")
        attrindexes = {}
        for i, aname in enumerate(attrlist):
            # addunique resolves name collisions in dsvars; one string
            # variable of attrlength bytes per requested attribute.
            anamemod = addunique(dsvars, aname)
            varlist.append(dsvars[anamemod], attrlength)
            attrindexes[aname.lower()] = i
    addvarinfo = makeaddinfo(dsn, filetypes, filenamepattern, dsvars,
                             attrindexes, attrlength)   # factory function
    files = [fixescapes(f) for f in files]   # UP is converting escape characters :-)
    # walk the list of files and directories and open
    try:   # will fail if spssaux is prior to version 2.3
        fh = spssaux.FileHandles()
    except:
        pass
    notfound = []
    for item in files:
        try:
            item = fh.resolve(item)
        except:
            pass
        if os.path.isfile(item):
            addvarinfo(item)
        elif os.path.isdir(item):
            for dirpath, dirnames, fnames in os.walk(item):
                for f in fnames:
                    try:
                        addvarinfo(os.path.join(dirpath, f))
                    except EnvironmentError as e:
                        # Unopenable files are collected and reported at the end.
                        notfound.append(e.args[0])
        else:
            if not isinstance(item, str):
                item = str(item, encoding)
            notfound.append(_("Not found: %s") % item)
    spss.Submit("DATASET ACTIVATE %s." % dsn)
    if not dsname is None:
        spss.Submit("DATASET NAME %s." % dsname)
        dsn = dsname
    if notfound:
        raise ValueError("\n".join(notfound))
    return dsn
def genVarsCategoryList(varnames, specialvalues, macroname, missing, order,
                        weightvar, specialsorder, valuelabelsdict,
                        missingvaluesdict, customattr, attrname):
    """Generate sorted list(s) of values with possible insertion of extra
    values and return list of SPSS macros to be created.

    varnames is a sequence of variable names to process.
    specialvalues is a sequence of values that should be inserted before the
    first zero count or at the end if no zeros, or None.  If a special value
    already occurs in a varname, it will be moved.
    macroname is a list of macronames of the same length as varnames to
    generate, or None.
    missing is 'include' or 'exclude' to determine whether user missing
    values are included or excluded.
    order is 'a' or 'd' to specify the sort direction.
    weightvar can be specified as a variable name to be used as a weight in
    determining the counts to sort by.  It must not occur in varnames.
    specialsorder is 'before' or 'after' and indicates the location of the
    specials section.
    valuelabelsdict, if not None, supplies labelled values to append with a
    zero count when they do not occur in the data.
    customattr indicates whether a custom attribute with the order should be
    generated; attrname is the name of that custom attribute.

    Returns (macrosgenerated, customattrlist).

    This function is mainly useful as a helper function for Ctables in
    building CATEGORIES subcommands.
    """
    if weightvar:
        if weightvar in varnames:
            raise ValueError(_("""The weight variable cannot be included as a variable."""))
        varnamesAndWeight = varnames + [weightvar]
    else:
        varnamesAndWeight = varnames
    curs = spssdata.Spssdata(indexes=varnamesAndWeight, names=False,
                             omitmissing=missing == 'exclude')
    nvar = len(varnames)
    vvalues = [{} for i in range(nvar)]   # for accumulating counts for all variable values
    for cn, case in enumerate(curs):
        casecpy = copy.copy(case)
        if weightvar:
            w = casecpy[nvar]   # weight is the extra trailing column
            if w is None:
                w = 0.0
        else:
            w = 1.0
        for i in range(nvar):
            if not casecpy[i] is None:   # omit sysmis values and optionally user missing values
                curval = casecpy[i]
                # count occurrences, possibly weighted
                vvalues[i][curval] = vvalues[i].get(curval, 0.) + w
    curs.CClose()
    valuelist = []
    macrosgenerated = []
    customattrlist = []
    for i, vname in enumerate(varnames):
        # if labeled values were supplied but did not occur in the data,
        # add them with a count of zero
        if not valuelabelsdict is None:
            labeledbutnotfound = valuelabelsdict[vname] - set(vvalues[i].keys())
            for val in labeledbutnotfound:
                vvalues[i][val] = 0.
        if not specialvalues is None:
            # remove special values from count list (they get re-inserted below)
            for v in specialvalues:
                if v in vvalues[i]:
                    del (vvalues[i][v])
        # BUG FIX: dict.iteritems() does not exist in Python 3; use items().
        valuelist.append(
            sorted([(value, key) for (key, value) in vvalues[i].items()],
                   reverse=order == 'd'))
        if not specialvalues is None:
            if specialsorder == "after":
                valuelist[i].extend([(None, v) for v in specialvalues])
            else:
                valuelist[i] = [(None, v) for v in specialvalues] + valuelist[i]
        # BUG FIX: basestring does not exist in Python 3; all SPSS string
        # values are str there, so isinstance(..., str) is the equivalent test.
        if isinstance(valuelist[i][0][1], str):
            qchar = '"'
        else:
            qchar = ''
        if macroname is not None:
            if not macroname[i].startswith("!"):
                macroname[i] = "!" + macroname[i]
            macrosgenerated.append([
                macroname[i],
                " ".join([qchar + strconv(k).rstrip() + qchar
                          for (value, k) in valuelist[i]])
            ])
        if customattr:
            customattrlist.append([
                vname,
                " ".join([qchar + strconv(k).rstrip() + qchar
                          for (value, k) in valuelist[i]])
            ])
    if customattr:
        try:   # cannot start datastep if there are pending transformations
            spss.StartDataStep()
        except:
            spss.Submit("EXECUTE.")
            spss.StartDataStep()
        ds = spss.Dataset()
        for spec in customattrlist:
            ds.varlist[spec[0]].attributes[attrname] = spec[1]
        spss.EndDataStep()
    return macrosgenerated, customattrlist