def main():
    """Build a shot-indexed dictionary of arrays from the LHD summary csv files.

    Reads the main summary file and a second file holding columns missed in
    the first, merges them, then deposits every column into an array indexed
    by shot number.  It is done simply, not necessarily efficiently:

    * records with a blank shot number are dropped;
    * shots are converted to int, de-duplicated and put in shot order;
    * the final arrays have one slot per shot from 0 to the largest shot;
    * for each column the non-blank entries are converted to int32 or
      float32 where possible, otherwise kept as stripped strings, and written
      to ``target[shot]``; unfilled slots are -1 (int), nan (float) or ''.

    Returns:
        dict: column name -> array of length ``max(shot) + 1``, plus a
        'Shot' entry holding the shot number where a record exists and -1
        elsewhere.

    Raises:
        LookupError: if the shot columns of the two csv files differ.
    """
    print('reading..')
    lhd = read_csv_data.read_csv_data(
        '/LINUX23/home/bdb112/LHD_Summary_Long.csv', header=3)
    print('{k} keys, {n} entries read'.format(n=len(lhd['nShotnumber']),
                                              k=len(lhd.keys())))
    # this is hacked in because I missed GAMMA and another in my big file
    lhd2 = read_csv_data.read_csv_data(
        '/home/bdb112/datamining/lhd_summary_data.csv', header=3)
    ksh = 'nShotnumber'
    ws2 = np.where(lhd[ksh] != lhd2[ksh])[0]
    if len(ws2) != 0:
        raise LookupError('{n} mismatched shots'.format(n=len(ws2)))

    # if we already have the key, give this one a different name,
    # otherwise keep the same name
    for k in lhd2.keys():
        if k in lhd.keys():
            lhd[k + '1'] = lhd2[k]
        else:
            lhd[k] = lhd2[k]

    LHD = {}
    tmp = {}
    sh = 90091  # only used in the failure message below
    err = 0
    str_summary = []

    wnotnull = np.where(lhd['nShotnumber'] != '')[0]
    # cautiously convert to int
    shots_tmp = lhd['nShotnumber'][wnotnull].astype(np.int32)
    # need unique here, as shot numbers can be duplicated in the file
    shots_test, ws = np.unique(shots_tmp, return_index=True)

    # ws indexes shots_tmp (the blank-free list), so map it back through
    # wnotnull to get row numbers valid for the original, unfiltered columns.
    rows = wnotnull[ws]
    # reorder the strings in a new dict, in shot number order.
    for k in lhd.keys():
        tmp.update({k: lhd[k][rows]})

    # now prepare the final shot array
    arrlen = int(np.max(shots_tmp)) + 1  # a spot for all shots including 0
    shots = np.zeros(arrlen, dtype=np.int32) - 1  # initialise to shot=-1
    shots[shots_tmp] = shots_tmp
    LHD.update({'Shot': shots})

    for k in tmp.keys():
        as_str_in_order = tmp[k]
        # now look for '' in other cols
        wcolnotnull = np.where(as_str_in_order != '')[0]
        non_blank = as_str_in_order[wcolnotnull]
        # shots_test is shots_tmp[ws], i.e. the shot of each row of tmp
        target = shots_test[wcolnotnull]
        # get a lot of values, in case the first choice is not representative
        values = '_'.join([s.strip() for s in non_blank[:10]])

        if re.match(r'^[_0-9]*$', values):
            dt = 'int32'
        elif re.match(r'^[_+\-.0-9eE]*$', values):
            # '-' is escaped: unescaped, '+-.' is a range that also admits ','
            dt = 'float32'
        else:
            dt = 'str'

        arr = None
        if dt != 'str':
            try:  # the conversion may go wrong - protect
                if dt == 'int32':
                    # the sample looked integral; check the whole column
                    wdecimal = np.where(
                        np.remainder(non_blank.astype(float), 1) != 0)[0]
                    if len(wdecimal) > 0:
                        print('reverting {k} to float based on {eg}'.format(
                            k=k, eg=non_blank[wdecimal[0]]))
                        dt = 'float32'
                if dt == 'int32':
                    arr = -np.ones(arrlen).astype(dt)
                else:
                    arr = np.nan + np.ones(arrlen).astype(dt)
                arr[target] = non_blank.astype(np.dtype(dt))
            except Exception as details:
                err += 1
                print('Failed on {k} (type was based on "{v}" for shot {sh}, {d}'
                      .format(k=k, d=details, v=values, sh=sh))
                arr = None  # fall through to the string form

        if arr is None:
            # string column, or a numeric conversion that failed
            arr = np.array(arrlen * [''], dtype='|S256')
            arr[target] = non_blank
            # compress, but beware assignments in the future.
            arr = np.array([s.strip() for s in arr])
            str_summary.append('{k}: {oldty}-> {dty}'.format(
                k=k, dty=arr.dtype, oldty=as_str_in_order.dtype))
            print('revert {k} to a string, type {dty}'.format(k=k,
                                                              dty=arr.dtype))

        LHD.update({k: arr})  # add the new entry

    return LHD
def get_basic_params(diags=None, shot=54196, times=None, delay=None, debug=0):
    """Return basic shot parameters evaluated at the given times.

    Args:
        diags: iterable of diagnostic names (keys of ``file_info``).
            Defaults to "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".
        shot: shot number.
        times: times at which values are wanted; defaults to
            ``np.linspace(0, 4, 4000)``.
        delay: subtracted from the file timebase; defaults to
            ``get_delay(shot)``.
        debug: debug level, combined with ``pyfusion.DEBUG`` for ``debug_``.

    Returns:
        dict mapping each diagnostic name to an array the same length as
        ``times``, plus 'check_tm' (the times) and 'check_shot' (the shot
        number repeated) to allow a cross check.  Diagnostics missing from
        ``file_info`` are all nan; diagnostics whose file could not be read
        are omitted.

    Raises:
        ValueError: if a data file reports other than 1 D data.
        LookupError: if the variable name does not match exactly one column.
    """
    global lhd_summary

    if diags is None:
        diags = "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".split(',')
    if delay is None:
        delay = get_delay(shot)
    if times is None:
        times = np.linspace(0, 4, 4000)
    times = np.array(times)

    vals = {}
    # create an extra time array to allow a cross check
    vals.update({'check_tm': times})
    # builtin int: the np.int alias no longer exists in current numpy
    vals.update({'check_shot': np.zeros(len(times), dtype=int) + shot})

    for diag in diags:
        if diag not in file_info:
            warn('diagnostic {0} not found in shot {1}'.format(diag, shot),
                 stacklevel=2)
            vals.update({diag: np.nan + times})
            continue

        info = file_info[diag]
        varname = info['name']
        # an "oper:name" entry requests a derived quantity
        if ':' in varname:
            (oper, varname) = varname.split(':')
        else:
            oper = None

        if info['format'].find('.csv') > 0:
            # the summary is cached in a module global; (re)load on first use
            try:
                lhd_summary.keys()
            except (NameError, AttributeError):
                print('reloading {0}'.format(info['format']))
                lhd_summary = read_csv_data(acq_LHD + info['format'],
                                            header=3)
            val = lhd_summary[varname][shot]
            # one value per shot, broadcast over the timebase
            valarr = np.double(val) + (times * 0)
        else:
            # try the plain file, then the compressed forms
            try:
                dg = igetfile(local_dir + info['format'], shot=shot)
            except IOError:
                try:
                    dg = igetfile(local_dir + info['format'] + '.bz2',
                                  shot=shot)
                except IOError:
                    try:
                        dg = igetfile(local_dir + info['format'] + '.gz',
                                      shot=shot)
                    except Exception:
                        # The original named an undefined `exception` here.
                        # Give up on this diagnostic and try the next one.
                        dg = None

            if dg is None:
                valarr = None
            else:
                nd = dg.vardict['DimNo']
                if nd != 1:
                    raise ValueError(
                        'Expecting a 1 D array in {0}, got {1}!'
                        .format(dg.filename, nd))

                # varname is treated as a regular expression
                matches = [re.match(varname, nam) is not None
                           for nam in dg.vardict['ValName']]
                w = np.where(np.array(matches))[0]
                if len(w) != 1:
                    raise LookupError(
                        'Need just one instance of variable {0} in {1}'.
                        format(varname, dg.filename))

                # get the column of the array corresponding to the name
                valarr = dg.data[:, nd + w[0]]
                tim = dg.data[:, 0] - delay

                if oper == 'ddt':  # derivative operator
                    valarr = np.diff(valarr) / (np.average(np.diff(tim)))
                    tim = (tim[0:-1] + tim[1:]) / 2.0
                elif oper == 'ddt2':  # abs(ddw)*derivative operator
                    dw = np.diff(valarr) / (np.average(np.diff(tim)))
                    ddw = np.diff(dw) / (np.average(np.diff(tim)))
                    tim = tim[2:]
                    valarr = 4e-6 * dw[1:] * np.abs(ddw)

                valarr = (stineman_interp(times, tim, valarr))
                # no data beyond the end of the file timebase
                w = np.where(times > max(tim))
                valarr[w] = np.nan

        # `!= None` on an ndarray compares elementwise; use identity instead
        if valarr is not None:
            vals.update({diag: valarr})

    debug_(max(pyfusion.DEBUG, debug), level=5, key='interp')
    return vals
"""Read the LHD summary csv file into a dictionary of arrays indexed by shot.

This may require adding a "0" shot (apparently not as of Feb 2013).
Where possible, integers and reals are converted, and the strings are
reduced to the minimum length.  (Note - this will cause errors if longer
strings are added afterwards.)
"""
from pyfusion.utils import read_csv_data
import numpy as np
import re

# set True to also merge in the columns of a second summary file
hack_merge_another_file = False

print('reading..')
lhd = read_csv_data.read_csv_data('LHD_Summary_Long.csv', header=3)
print('{k} keys, {n} entries read'.format(
    k=len(lhd.keys()), n=len(lhd['nShotnumber'])))

# this is hacked in because I missed GAMMA and another in my big file
if hack_merge_another_file:
    lhd2 = read_csv_data.read_csv_data(
        '/home/bdb112/datamining/lhd_summary_data.csv', header=3)
    ksh = 'nShotnumber'
    # the two files must list the same shots in the same order
    ws2 = np.where(lhd[ksh] != lhd2[ksh])[0]
    if len(ws2):
        raise LookupError('{n} mismatched shots'.format(n=len(ws2)))

    # a key we already have gets a '1' suffix; new keys keep their name
    for k in lhd2.keys():
        lhd[k + '1' if k in lhd.keys() else k] = lhd2[k]
def get_basic_diagnostics(diags=None, shot=54196, times=None, delay=None,
                          exception=False, debug=0):
    """Return basic diagnostics for a shot, evaluated at the given times.

    Will access server if env('IGETFILE') points to an exe, else accesses
    cache.

    Args:
        diags: a diagnostic name or an iterable of names (keys of
            ``file_info``).  Defaults to
            "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".
        shot: shot number.
        times: times at which values are wanted; defaults to
            ``np.linspace(0, 4, 4000)``.
        delay: subtracted from the file timebase; defaults to
            ``get_delay(shot)``.
        exception: exception class (or tuple of classes) tolerated when the
            last attempt to read a data file fails.  ``None`` means tolerate
            no exceptions.  The default ``False`` means "not given":
            ``Exception`` when ``debug == 0``, otherwise nothing is tolerated.
        debug: debug level.

    Returns:
        dict mapping each diagnostic name to an array the same length as
        ``times``, plus 'check_tm' (the times) and 'check_shot' (the shot
        number repeated) to allow a cross check.  Diagnostics missing from
        ``file_info`` are all nan; diagnostics whose file could not be read
        are omitted.

    Raises:
        ValueError: if a data file reports other than 1 D data, or its
            timebase has too few points or is degenerate.
        LookupError: if a single-column variable does not match exactly one
            column, or the data array is not 2 D.
    """
    global lhd_summary

    # exception=None is a valid entry, meaning tolerate no exceptions, so the
    # "default" we use is False.  Both end up as something `except` accepts:
    # `except None:` / `except False:` are themselves errors in Python 3,
    # whereas `except ():` simply catches nothing.
    if exception is False:
        exception = Exception if debug == 0 else ()
    elif exception is None:
        exception = ()

    def _rms(a, axis):
        return np.sqrt(np.mean(np.square(a), axis))

    # operators that combine all matching columns into one
    reducers = {'sum': np.sum, 'average': np.average, 'min': np.min,
                'max': np.max, 'std': np.std, 'rms': _rms}

    if diags is None:
        diags = "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".split(',')
    if len(np.shape(diags)) == 0:  # a single name was given
        diags = [diags]
    if delay is None:
        delay = get_delay(shot)
    if times is None:
        times = np.linspace(0, 4, 4000)
    times = np.array(times)

    vals = {}
    # create an extra time array to allow a cross check
    vals.update({'check_tm': times})
    # builtin int: the np.int alias no longer exists in current numpy
    vals.update({'check_shot': np.zeros(len(times), dtype=int) + shot})

    for diag in diags:
        if diag not in file_info:
            warn('diagnostic {0} not found in shot {1}'.format(diag, shot),
                 stacklevel=2)
            vals.update({diag: np.nan + times})
            continue

        info = file_info[diag]
        varname = info['name']
        subfolder = info['format'].split('@')[0]
        filepath = os.path.sep.join(
            [localigetfilepath, subfolder, info['format']])
        # an "oper:name" entry requests a derived quantity
        if ':' in varname:
            (oper, varname) = varname.split(':')
        else:
            oper = None

        if info['format'].find('.csv') > 0:
            # the summary is cached in a module global; (re)load on first use
            try:
                lhd_summary.keys()
            except (NameError, AttributeError):
                csvfilename = acq_LHD + '/' + info['format']
                if pyfusion.DBG() > 1:
                    print('looking for lhd summary in' + csvfilename)
                if not os.path.exists(csvfilename):
                    csvfilename += ".bz2"
                print('reloading {0}'.format(csvfilename))
                lhd_summary = read_csv_data(csvfilename, header=3)
            # should make this more formal - last shots
            # from an 'extra' file, and finally, from shot info
            if shot > 117000:  # fudge to get latest data
                # NOTE(review): if 'LHD' is stored as a pickled object array,
                # recent numpy needs allow_pickle=True here - confirm.
                lhd_summary = np.load(
                    acq_LHD + '/LHD_summary.npz')['LHD'].tolist()
                print('loading newer shots from a separate file - fix-me')
            val = lhd_summary[varname][shot]
            # one value per shot, broadcast over the timebase
            valarr = np.double(val) + (times * 0)
        else:
            debug_(max(pyfusion.DBG(), debug), level=4, key='find_data')
            # try the plain file, then the compressed forms
            try:
                dg = igetfile(filepath, shot=shot, debug=debug - 1)
            except IOError:
                try:
                    dg = igetfile(filepath + '.bz2', shot=shot,
                                  debug=debug - 1)
                except IOError:
                    try:
                        dg = igetfile(filepath + '.gz', shot=shot,
                                      debug=debug - 1)
                    except exception as details:
                        if debug > 0:
                            print('diag at {fp} not found'.format(
                                fp=filepath))
                            print(details, details.args)
                        dg = None  # give up and try next diagnostic

            if dg is None:
                valarr = None
            else:
                nd = dg.vardict['DimNo']
                if nd != 1:
                    raise ValueError(
                        'Expecting a 1 D array in {0}, got {1}!'.format(
                            dg.filename, nd))

                # varname is treated as a regular expression
                matches = [re.match(varname, nam) is not None
                           for nam in dg.vardict['ValName']]
                w = np.where(np.array(matches))[0]

                # get the column(s) of the array corresponding to the name
                if oper in reducers:
                    valarr = reducers[oper](dg.data[:, nd + w], 1)
                else:
                    if len(w) != 1:
                        raise LookupError(
                            'Need just one instance of variable {0} in {1}'
                            .format(varname, dg.filename))
                    if len(np.shape(dg.data)) != 2:
                        raise LookupError(
                            'insufficient data for {0} in {1}'.format(
                                varname, dg.filename))
                    valarr = dg.data[:, nd + w[0]]

                tim = dg.data[:, 0] - delay

                if oper == 'ddt':  # derivative operator
                    valarr = np.diff(valarr) / (np.average(np.diff(tim)))
                    tim = (tim[0:-1] + tim[1:]) / 2.0
                elif oper == 'ddt2':  # abs(ddw)*derivative operator
                    dw = np.diff(valarr) / (np.average(np.diff(tim)))
                    ddw = np.diff(dw) / (np.average(np.diff(tim)))
                    tim = tim[2:]
                    valarr = 4e-6 * dw[1:] * np.abs(ddw)

                if (len(tim) < 10) or (np.std(tim) < 0.1):
                    raise ValueError('Insufficient points or degenerate '
                                     'timebase data in {0}, {1}'.format(
                                         varname, dg.filename))

                valarr = (stineman_interp(times, tim, valarr))
                # no data beyond the end of the file timebase
                w = np.where(times > max(tim))
                valarr[w] = np.nan

        # `!= None` on an ndarray compares elementwise; use identity instead
        if valarr is not None:
            vals.update({diag: valarr})

    debug_(max(pyfusion.DBG(), debug), level=5, key='interp')
    return vals
def get_basic_diagnostics(diags=None, shot=54196, times=None, delay=None,
                          exception=False, debug=0):
    """Return basic diagnostics for a shot, evaluated at the given times.

    Args:
        diags: a diagnostic name or an iterable of names (keys of
            ``file_info``).  Defaults to
            "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".
        shot: shot number.
        times: times at which values are wanted; defaults to
            ``np.linspace(0, 4, 4000)``.
        delay: subtracted from the file timebase; defaults to
            ``get_delay(shot)``.
        exception: exception class (or tuple of classes) tolerated when the
            last attempt to read a data file fails.  ``None`` means tolerate
            no exceptions.  The default ``False`` means "not given":
            ``Exception`` when ``debug == 0``, otherwise nothing is tolerated.
        debug: debug level.

    Returns:
        dict mapping each diagnostic name to an array the same length as
        ``times``, plus 'check_tm' (the times) and 'check_shot' (the shot
        number repeated) to allow a cross check.  Diagnostics missing from
        ``file_info`` are all nan; diagnostics whose file could not be read
        are omitted.

    Raises:
        ValueError: if a data file reports other than 1 D data, or its
            timebase has too few points or is degenerate.
        LookupError: if a single-column variable does not match exactly one
            column, or the data array is not 2 D.
    """
    global lhd_summary

    # exception=None is a valid entry, meaning tolerate no exceptions, so the
    # "default" we use is False.  Both end up as something `except` accepts:
    # `except None:` / `except False:` are themselves errors in Python 3,
    # whereas `except ():` simply catches nothing.
    if exception is False:
        exception = Exception if debug == 0 else ()
    elif exception is None:
        exception = ()

    def _rms(a, axis):
        return np.sqrt(np.mean(np.square(a), axis))

    # operators that combine all matching columns into one
    reducers = {'sum': np.sum, 'average': np.average, 'min': np.min,
                'max': np.max, 'std': np.std, 'rms': _rms}

    # identity tests: `== None` on an array argument compares elementwise
    if diags is None:
        diags = "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".split(',')
    if len(np.shape(diags)) == 0:  # a single name was given
        diags = [diags]
    if delay is None:
        delay = get_delay(shot)
    if times is None:
        times = np.linspace(0, 4, 4000)
    times = np.array(times)

    vals = {}
    # create an extra time array to allow a cross check
    vals.update({'check_tm': times})
    # builtin int: the np.int alias no longer exists in current numpy
    vals.update({'check_shot': np.zeros(len(times), dtype=int) + shot})

    for diag in diags:
        if diag not in file_info:  # dict.has_key is Python 2 only
            warn('diagnostic {0} not found in shot {1}'.format(diag, shot),
                 stacklevel=2)
            vals.update({diag: np.nan + times})
            continue

        info = file_info[diag]
        varname = info['name']
        subfolder = info['format'].split('@')[0]
        filepath = os.path.sep.join(
            [localigetfilepath, subfolder, info['format']])
        # an "oper:name" entry requests a derived quantity
        if ':' in varname:
            (oper, varname) = varname.split(':')
        else:
            oper = None

        if info['format'].find('.csv') > 0:
            # the summary is cached in a module global; (re)load on first use
            try:
                lhd_summary.keys()
            except (NameError, AttributeError):
                print('reloading {0}'.format(info['format']))
                lhd_summary = read_csv_data(acq_LHD + '/' + info['format'],
                                            header=3)
            val = lhd_summary[varname][shot]
            # one value per shot, broadcast over the timebase
            valarr = np.double(val) + (times * 0)
        else:
            debug_(max(pyfusion.DEBUG, debug), level=4, key='find_data')
            # try the plain file, then the compressed forms
            try:
                dg = igetfile(filepath, shot=shot, debug=debug - 1)
            except IOError:
                try:
                    dg = igetfile(filepath + '.bz2', shot=shot,
                                  debug=debug - 1)
                except IOError:
                    try:
                        dg = igetfile(filepath + '.gz', shot=shot,
                                      debug=debug - 1)
                    except exception:
                        if debug > 0:
                            print('diag at {fp} not found'
                                  .format(fp=filepath))
                        dg = None  # give up and try next diagnostic

            if dg is None:
                valarr = None
            else:
                nd = dg.vardict['DimNo']
                if nd != 1:
                    raise ValueError(
                        'Expecting a 1 D array in {0}, got {1}!'
                        .format(dg.filename, nd))

                # varname is treated as a regular expression
                matches = [re.match(varname, nam) is not None
                           for nam in dg.vardict['ValName']]
                w = np.where(np.array(matches))[0]

                # get the column(s) of the array corresponding to the name
                if oper in reducers:
                    valarr = reducers[oper](dg.data[:, nd + w], 1)
                else:
                    if len(w) != 1:
                        raise LookupError(
                            'Need just one instance of variable {0} in {1}'
                            .format(varname, dg.filename))
                    if len(np.shape(dg.data)) != 2:
                        raise LookupError(
                            'insufficient data for {0} in {1}'
                            .format(varname, dg.filename))
                    valarr = dg.data[:, nd + w[0]]

                tim = dg.data[:, 0] - delay

                if oper == 'ddt':  # derivative operator
                    valarr = np.diff(valarr) / (np.average(np.diff(tim)))
                    tim = (tim[0:-1] + tim[1:]) / 2.0
                elif oper == 'ddt2':  # abs(ddw)*derivative operator
                    dw = np.diff(valarr) / (np.average(np.diff(tim)))
                    ddw = np.diff(dw) / (np.average(np.diff(tim)))
                    tim = tim[2:]
                    valarr = 4e-6 * dw[1:] * np.abs(ddw)

                if (len(tim) < 10) or (np.std(tim) < 0.1):
                    raise ValueError('Insufficient points or degenerate '
                                     'timebase data in {0}, {1}'
                                     .format(varname, dg.filename))

                valarr = (stineman_interp(times, tim, valarr))
                # no data beyond the end of the file timebase
                w = np.where(times > max(tim))
                valarr[w] = np.nan

        # `!= None` on an ndarray compares elementwise; use identity instead
        if valarr is not None:
            vals.update({diag: valarr})

    debug_(max(pyfusion.DEBUG, debug), level=5, key='interp')
    return vals
def get_basic_diagnostics(diags=None, file_info=file_info, shot=54196,
                          times=None, delay=None, exception=False, debug=0):
    """Return basic diagnostics for a shot.

    Will access server if env('IGETFILE') points to an exe, else accesses
    cache.

    This is the first version to specifically allow for access through
    pyfusion.cfg.  There are two types of access:
      I/  single diag on its own timebase
      II/ the original multi diag on a given timebase (i.e. that from
          flucstrcs)
    Stage 1 puts the file_info into the .cfg file just for I/ single diag
    access.  Ideally the file_info for II/ should be in .cfg also.
    For stage I/, we call it with a file_info dict constructed on the spot
    as a dictionary with just one entry (for diags[0]).

    Args:
        diags: a diagnostic name or an iterable of names (keys of
            ``file_info``).  Defaults to
            "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".
        file_info: dict describing where each diagnostic is found; defaults
            to the module-level ``file_info``.
        shot: shot number.
        times: times at which values are wanted.  If None and more than one
            diag is requested, ``np.linspace(0, 4, 4000)`` is used; if None
            with a single diag, the timebase read from the data file is used.
        delay: subtracted from the file timebase; defaults to
            ``get_delay(shot)``.
        exception: exception class (or tuple of classes) tolerated when
            reading a data file fails.  ``None`` means tolerate no
            exceptions.  The default ``False`` means "not given":
            ``Exception`` when ``debug == 0``, otherwise nothing is tolerated.
        debug: debug level.

    Returns:
        dict mapping each diagnostic name to its values, plus 'check_tm'
        (the timebase) and 'check_shot' (the shot number repeated) to allow
        a cross check.  Diagnostics whose file could not be read are
        omitted.  If no timebase could be established at all, 'check_tm' and
        'check_shot' are empty arrays, a diag missing from ``file_info`` is a
        scalar nan and a csv summary value is returned as a scalar.

    Raises:
        ValueError: if a data file reports other than 1 D data, or its
            timebase has too few points or is degenerate.
        LookupError: if a single-column variable does not match exactly one
            column, or the data array is not 2 D.
    """
    global lhd_summary

    # exception=None is a valid entry, meaning tolerate no exceptions, so the
    # "default" we use is False.  Both end up as something `except` accepts:
    # `except None:` / `except False:` are themselves errors in Python 3,
    # whereas `except ():` simply catches nothing.
    if exception is False:
        exception = Exception if debug == 0 else ()
    elif exception is None:
        exception = ()

    def _rms(a, axis):
        return np.sqrt(np.mean(np.square(a), axis))

    # operators that combine all matching columns into one
    reducers = {'sum': np.sum, 'average': np.average, 'min': np.min,
                'max': np.max, 'std': np.std, 'rms': _rms}

    if diags is None:
        diags = "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".split(',')
    if len(np.shape(diags)) == 0:  # a single name was given
        diags = [diags]
    if delay is None:
        delay = get_delay(shot)

    if times is None:
        if len(diags) > 1:
            times = np.linspace(0, 4, 4000)  # this is a crude guess.
        # else leave it None, and take the timebase from the data file
    else:
        # make sure it is an array
        times = np.array(times)

    vals = {}

    for diag in diags:
        if diag not in file_info:
            warn('diagnostic {0} not found in shot {1}'.format(diag, shot),
                 stacklevel=2)
            # times may still be None here (single diag, no timebase yet)
            vals.update({diag: np.nan + times if times is not None
                         else np.nan})
            continue

        info = file_info[diag]
        varname = info['name']  # name for igetfile - can contain ':'
        subfolder = info['format'].split('@')[0]
        filepath = os.path.sep.join(
            [localigetfilepath, subfolder, info['format']])
        # an "oper:name" entry requests a derived quantity
        if ':' in varname:
            (oper, varname) = varname.split(':')
        else:
            oper = None

        if info['format'].find('.csv') > 0:
            # the summary is cached in a module global; (re)load on first use
            try:
                lhd_summary.keys()
            except (NameError, AttributeError):
                csvfilename = acq_LHD + '/' + info['format']
                if pyfusion.DBG() > 1:
                    print('looking for lhd summary in' + csvfilename)
                if not os.path.exists(csvfilename):
                    csvfilename += ".bz2"
                print('reloading {0}'.format(csvfilename))
                lhd_summary = read_csv_data(csvfilename, header=3)
            # should make this more formal - last shots
            # from an 'extra' file, and finally, from shot info
            if shot > 117000:  # fudge to get latest data
                # NOTE(review): if 'LHD' is stored as a pickled object array,
                # recent numpy needs allow_pickle=True here - confirm.
                lhd_summary = np.load(
                    acq_LHD + '/LHD_summary.npz')['LHD'].tolist()
                print('loading newer shots from a separate file - fix-me')
            val = lhd_summary[varname][shot]
            if times is not None:
                # one value per shot, broadcast over the timebase
                valarr = np.double(val) + (times * 0)
            else:
                valarr = np.double(val)
        else:
            try:  # now igetfile checks for .gz etc
                dg = igetfile(filepath, shot=shot, debug=debug - 1)
            except exception as details:
                if debug > 0:
                    print('diag at {fp} not found'
                          .format(fp=filepath))
                    print(details, details.args)
                dg = None  # give up and try next diagnostic

            if dg is None:
                valarr = None
            else:
                nd = dg.vardict['DimNo']
                if nd != 1:
                    raise ValueError(
                        'Expecting a 1 D array in {0}, got {1}!'
                        .format(dg.filename, nd))

                # varname is treated as a regular expression
                matches = [re.match(varname, nam) is not None
                           for nam in dg.vardict['ValName']]
                w = np.where(np.array(matches))[0]

                # get the column(s) of the array corresponding to the name
                if oper in reducers:
                    valarr = reducers[oper](dg.data[:, nd + w], 1)
                else:
                    if len(w) != 1:
                        raise LookupError(
                            'Need just one instance of variable {0} in {1}'
                            .format(varname, dg.filename))
                    if len(np.shape(dg.data)) != 2:
                        raise LookupError(
                            'insufficient data for {0} in {1}'
                            .format(varname, dg.filename))
                    valarr = dg.data[:, nd + w[0]]

                tim = dg.data[:, 0] - delay

                if oper == 'ddt':  # derivative operator
                    valarr = np.diff(valarr) / (np.average(np.diff(tim)))
                    tim = (tim[0:-1] + tim[1:]) / 2.0
                elif oper == 'ddt2':  # abs(ddw)*derivative operator
                    dw = np.diff(valarr) / (np.average(np.diff(tim)))
                    ddw = np.diff(dw) / (np.average(np.diff(tim)))
                    tim = tim[2:]
                    valarr = 4e-6 * dw[1:] * np.abs(ddw)

                if (len(tim) < 10) or (np.std(tim) < 0.1):
                    raise ValueError('Insufficient points or degenerate '
                                     'timebase data in {0}, {1}'
                                     .format(varname, dg.filename))

                if times is not None:
                    # NOTE(review): this block uses both pyfusion.DBG() and
                    # pyfusion.DEBUG - confirm both still exist.
                    debug_(max(pyfusion.DEBUG, debug), level=5, key='interp')
                    valarr = (stineman_interp(times, tim, valarr))
                    # no data beyond the end of the file timebase
                    w = np.where(times > max(tim))
                    valarr[w] = np.nan
                else:
                    # single diag on its own timebase
                    times = tim

        if valarr is not None:
            vals.update({diag: valarr})

    # A single diag that could not be read leaves times as None; report an
    # empty timebase rather than failing on len(None).
    if times is None:
        times = np.array([])
    # create an extra time array to allow a cross check
    vals.update({'check_tm': times})
    # builtin int: the np.int alias no longer exists in current numpy
    vals.update({'check_shot': np.zeros(len(times), dtype=int) + shot})
    return vals
def get_basic_params(diags=None, shot=54196, times=None, delay=None, debug=0):
    """Return basic shot parameters evaluated at the given times.

    Args:
        diags: iterable of diagnostic names (keys of ``file_info``).
            Defaults to "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".
        shot: shot number.
        times: times at which values are wanted; defaults to
            ``np.linspace(0, 4, 4000)``.
        delay: subtracted from the file timebase; defaults to
            ``get_delay(shot)``.
        debug: debug level, combined with ``pyfusion.DEBUG`` for ``debug_``.

    Returns:
        dict mapping each diagnostic name to an array the same length as
        ``times``, plus 'check_tm' (the times) and 'check_shot' (the shot
        number repeated) to allow a cross check.  Diagnostics missing from
        ``file_info`` are all nan; diagnostics whose file could not be read
        are omitted.

    Raises:
        ValueError: if a data file reports other than 1 D data.
        LookupError: if the variable name does not match exactly one column.
    """
    global lhd_summary

    if diags is None:
        diags = "<n_e19>,b_0,i_p,w_p,dw_pdt,dw_pdt2".split(',')
    if delay is None:
        delay = get_delay(shot)
    if times is None:
        times = np.linspace(0, 4, 4000)
    times = np.array(times)

    vals = {}
    # create an extra time array to allow a cross check
    vals.update({'check_tm': times})
    # builtin int: the np.int alias no longer exists in current numpy
    vals.update({'check_shot': np.zeros(len(times), dtype=int) + shot})

    for diag in diags:
        if diag not in file_info:
            warn('diagnostic {0} not found in shot {1}'.format(diag, shot),
                 stacklevel=2)
            vals.update({diag: np.nan + times})
            continue

        info = file_info[diag]
        varname = info['name']
        # an "oper:name" entry requests a derived quantity
        if ':' in varname:
            (oper, varname) = varname.split(':')
        else:
            oper = None

        if info['format'].find('.csv') > 0:
            # the summary is cached in a module global; (re)load on first use
            try:
                lhd_summary.keys()
            except (NameError, AttributeError):
                print('reloading {0}'.format(info['format']))
                lhd_summary = read_csv_data(acq_LHD + info['format'],
                                            header=3)
            val = lhd_summary[varname][shot]
            # one value per shot, broadcast over the timebase
            valarr = np.double(val) + (times * 0)
        else:
            # try the plain file, then the compressed forms
            try:
                dg = igetfile(local_dir + info['format'], shot=shot)
            except IOError:
                try:
                    dg = igetfile(local_dir + info['format'] + '.bz2',
                                  shot=shot)
                except IOError:
                    try:
                        dg = igetfile(local_dir + info['format'] + '.gz',
                                      shot=shot)
                    except Exception:
                        # The original named an undefined `exception` here.
                        # Give up on this diagnostic and try the next one.
                        dg = None

            if dg is None:
                valarr = None
            else:
                nd = dg.vardict['DimNo']
                if nd != 1:
                    raise ValueError(
                        'Expecting a 1 D array in {0}, got {1}!'.format(
                            dg.filename, nd))

                # varname is treated as a regular expression
                matches = [re.match(varname, nam) is not None
                           for nam in dg.vardict['ValName']]
                w = np.where(np.array(matches))[0]
                if len(w) != 1:
                    raise LookupError(
                        'Need just one instance of variable {0} in {1}'.
                        format(varname, dg.filename))

                # get the column of the array corresponding to the name
                valarr = dg.data[:, nd + w[0]]
                tim = dg.data[:, 0] - delay

                if oper == 'ddt':  # derivative operator
                    valarr = np.diff(valarr) / (np.average(np.diff(tim)))
                    tim = (tim[0:-1] + tim[1:]) / 2.0
                elif oper == 'ddt2':  # abs(ddw)*derivative operator
                    dw = np.diff(valarr) / (np.average(np.diff(tim)))
                    ddw = np.diff(dw) / (np.average(np.diff(tim)))
                    tim = tim[2:]
                    valarr = 4e-6 * dw[1:] * np.abs(ddw)

                valarr = (stineman_interp(times, tim, valarr))
                # no data beyond the end of the file timebase
                w = np.where(times > max(tim))
                valarr[w] = np.nan

        # `!= None` on an ndarray compares elementwise; use identity instead
        if valarr is not None:
            vals.update({diag: valarr})

    debug_(max(pyfusion.DEBUG, debug), level=5, key='interp')
    return vals
def main():
    """Build a shot-indexed dictionary of arrays from the LHD summary csv files.

    Reads the main summary file and a second file holding columns missed in
    the first, merges them, then deposits every column into an array indexed
    by shot number.  It is done simply, not necessarily efficiently:

    * records with a blank shot number are dropped;
    * shots are converted to int, de-duplicated and put in shot order;
    * the final arrays have one slot per shot from 0 to the largest shot;
    * for each column the non-blank entries are converted to int32 or
      float32 where possible, otherwise kept as stripped strings, and written
      to ``target[shot]``; unfilled slots are -1 (int), nan (float) or ''.

    Returns:
        dict: column name -> array of length ``max(shot) + 1``, plus a
        'Shot' entry holding the shot number where a record exists and -1
        elsewhere.

    Raises:
        LookupError: if the shot columns of the two csv files differ.
    """
    print('reading..')
    lhd = read_csv_data.read_csv_data(
        '/LINUX23/home/bdb112/LHD_Summary_Long.csv', header=3)
    print('{k} keys, {n} entries read'.format(n=len(lhd['nShotnumber']),
                                              k=len(lhd.keys())))
    # this is hacked in because I missed GAMMA and another in my big file
    lhd2 = read_csv_data.read_csv_data(
        '/home/bdb112/datamining/lhd_summary_data.csv', header=3)
    ksh = 'nShotnumber'
    ws2 = np.where(lhd[ksh] != lhd2[ksh])[0]
    if len(ws2) != 0:
        raise LookupError('{n} mismatched shots'.format(n=len(ws2)))

    # if we already have the key, give this one a different name,
    # otherwise keep the same name
    for k in lhd2.keys():
        if k in lhd.keys():
            lhd[k + '1'] = lhd2[k]
        else:
            lhd[k] = lhd2[k]

    LHD = {}
    tmp = {}
    sh = 90091  # only used in the failure message below
    err = 0
    str_summary = []

    wnotnull = np.where(lhd['nShotnumber'] != '')[0]
    # cautiously convert to int
    shots_tmp = lhd['nShotnumber'][wnotnull].astype(np.int32)
    # need unique here, as shot numbers can be duplicated in the file
    shots_test, ws = np.unique(shots_tmp, return_index=True)

    # ws indexes shots_tmp (the blank-free list), so map it back through
    # wnotnull to get row numbers valid for the original, unfiltered columns.
    rows = wnotnull[ws]
    # reorder the strings in a new dict, in shot number order.
    for k in lhd.keys():
        tmp.update({k: lhd[k][rows]})

    # now prepare the final shot array
    arrlen = int(np.max(shots_tmp)) + 1  # a spot for all shots including 0
    shots = np.zeros(arrlen, dtype=np.int32) - 1  # initialise to shot=-1
    shots[shots_tmp] = shots_tmp
    LHD.update({'Shot': shots})

    for k in tmp.keys():
        as_str_in_order = tmp[k]
        # now look for '' in other cols
        wcolnotnull = np.where(as_str_in_order != '')[0]
        non_blank = as_str_in_order[wcolnotnull]
        # shots_test is shots_tmp[ws], i.e. the shot of each row of tmp
        target = shots_test[wcolnotnull]
        # get a lot of values, in case the first choice is not representative
        values = '_'.join([s.strip() for s in non_blank[:10]])

        if re.match(r'^[_0-9]*$', values):
            dt = 'int32'
        elif re.match(r'^[_+\-.0-9eE]*$', values):
            # '-' is escaped: unescaped, '+-.' is a range that also admits ','
            dt = 'float32'
        else:
            dt = 'str'

        arr = None
        if dt != 'str':
            try:  # the conversion may go wrong - protect
                if dt == 'int32':
                    # the sample looked integral; check the whole column
                    wdecimal = np.where(
                        np.remainder(non_blank.astype(float), 1) != 0)[0]
                    if len(wdecimal) > 0:
                        print('reverting {k} to float based on {eg}'
                              .format(k=k, eg=non_blank[wdecimal[0]]))
                        dt = 'float32'
                if dt == 'int32':
                    arr = -np.ones(arrlen).astype(dt)
                else:
                    arr = np.nan + np.ones(arrlen).astype(dt)
                arr[target] = non_blank.astype(np.dtype(dt))
            except Exception as details:
                err += 1
                print('Failed on {k} (type was based on "{v}" for shot {sh}, {d}'
                      .format(k=k, d=details, v=values, sh=sh))
                arr = None  # fall through to the string form

        if arr is None:
            # string column, or a numeric conversion that failed
            arr = np.array(arrlen * [''], dtype='|S256')
            arr[target] = non_blank
            # compress, but beware assignments in the future.
            arr = np.array([s.strip() for s in arr])
            str_summary.append('{k}: {oldty}-> {dty}'
                               .format(k=k, dty=arr.dtype,
                                       oldty=as_str_in_order.dtype))
            print('revert {k} to a string, type {dty}'.format(k=k,
                                                              dty=arr.dtype))

        LHD.update({k: arr})  # add the new entry

    return LHD