def get_scores(filename, competition_id):
    """Score a submission file against the competition's ground truth.

    Both files are parsed as ``<id>,<number>`` records and ordered by id.
    The leading 15% of the ordered records form the preview partition and
    the remainder the final partition; each is scored with ROC AUC.

    Returns:
        (preview_score, score)

    Raises:
        ParsingError: when the submission yields no records or its ids do
            not equal the ground-truth ids.
    """
    record_pattern = r'(.+),(\d+\.\d+|\d+)'
    record_dtype = [('id', 'S128'), ('v0', np.float32)]

    # parse files
    predictions = np.fromregex(filename, record_pattern, record_dtype)
    groundtruth_folder = app.config['GROUNDTRUTH_FOLDER']
    competition = Competition.query.get(competition_id)
    groundtruth = np.fromregex(
        os.path.join(groundtruth_folder, competition.groundtruth),
        record_pattern,
        record_dtype)

    # sort data so that both arrays are aligned on id
    for records in (predictions, groundtruth):
        records.sort(order='id')

    submission_is_empty = predictions['id'].size == 0
    if submission_is_empty or not np.array_equal(predictions['id'],
                                                 groundtruth['id']):
        raise ParsingError(
            "Error parsing the submission file. Make sure it has the right format and contains the right ids."
        )

    # partition the data indices into two sets and evaluate separately
    splitpoint = int(np.round(len(groundtruth) * 0.15))
    expected, submitted = groundtruth['v0'], predictions['v0']
    score_p = roc_auc_score(expected[:splitpoint], submitted[:splitpoint])
    score_f = roc_auc_score(expected[splitpoint:], submitted[splitpoint:])
    return (score_p, score_f)
def toNumpy(self, regexp, dtypes):
    """Parse ``self.data`` with ``numpy.fromregex`` and return the array.

    The data is first offered to numpy through a ``StringIO``; when that
    raises ``TypeError`` the data is re-wrapped as bytes in a ``BytesIO``
    and parsed again.

    @param regexp: regular expression whose groups correspond to the dtypes
    @param dtypes: dtype specification handed to ``numpy.fromregex``
    """
    import numpy as np

    try:
        parsed = np.fromregex(StringIO(self.data), regexp, dtypes)
    except TypeError:
        from PyFoam.ThirdParty.six import BytesIO, b

        raw = BytesIO(b(self.data))
        parsed = np.fromregex(raw, regexp, dtypes)
    return parsed
def get_ssi_dict(dirpath):
    """Save SSI data to a dictionary, where each HDF5 file is an entry.

    Every file in ``dirpath`` (in sorted name order) is scanned with a
    fixed column regex.  Files without any matching data row are skipped.
    For the others an entry keyed by the first 27 characters of the file
    name is stored with:

    * ``ls``  - first value found by the ``... - L_S`` header regex
    * ``lat`` / ``lst`` - values taken from the middle data row
    * ``alt`` / ``o3`` / ``err`` - full column arrays

    Raises:
        IndexError: if a file has data rows but no ``L_S`` header line.
    """
    # One capture group per stored field; uncaptured columns are skipped.
    regex = (
        r"\s+(\d+[.]\d+)"                       # -> 'alt'
        r"\s+[- ]\d+[.]\d+"
        r"\s+([- ]\d+[.]\d+)"                   # -> 'lat'
        r"\s+(\d+[.]\d+)"                       # -> 'lst'
        r"\s+(?:\d+[.]\d+e.\d+|\d+[.])"
        r"\s+(?:\d+[.]\d+e.\d+|\d+[.])"
        r"\s+(?:\d+[.]\d+e.\d+|\d+[.])"
        r"\s+\d+[.]\d+e.\d+"
        r"\s+\d+[.]\d+e.\d+"
        r"\s+((?:\d+[.]\d+e.\d+|\d+[.]))"       # -> 'o3'
        r"\s+((?:\d+[.]\d+e.\d+|\d+[.]))"       # -> 'err'
        r"\s+\S+"
    )
    # Builtin float replaces the np.float alias removed in NumPy 1.24.
    data_dtype = {
        'names': ('alt', 'lat', 'lst', 'o3', 'err'),
        'formats': (float, float, float, float, float),
    }

    ssi_dict = {}
    for each_filename in sorted(os.listdir(dirpath)):
        filepath = os.path.join(dirpath, each_filename)
        data = np.fromregex(filepath, regex, dtype=data_dtype)

        # not always data!! -> nothing to store, and no header is required
        if len(data["alt"]) == 0:
            continue

        # get ls from header line in file
        ls_data = np.fromregex(filepath, r"\s+(\d+.\d+)\s+-\s+L_S",
                               [('ls', float)])
        middle_index = int(len(data["alt"]) / 2)
        ssi_dict[each_filename[:27]] = {
            "ls": ls_data["ls"][0],
            "lat": data["lat"][middle_index],
            "lst": data["lst"][middle_index],
            "alt": data["alt"],
            "o3": data["o3"],
            "err": data["err"],
        }
    return ssi_dict
def get_scores(filename, competition_id):
    """Score a label submission against the competition's ground truth.

    Both files are parsed as ``<integer id>,<label>`` records and ordered
    by id.  The leading 15% of the ordered records form the preview
    partition and the remainder the final partition; each is scored with
    ``accuracy_score``.

    Returns:
        (preview_score, score)

    Raises:
        ParsingError: when the submission yields no records or its ids do
            not equal the ground-truth ids.
    """
    regex = r'(\d+),(.+)'
    record_dtype = [('id', np.int64), ('v0', 'S128')]

    # parse files
    predictions = np.fromregex(filename, regex, record_dtype)
    groundtruth_filename = os.path.join(
        app.config['GROUNDTRUTH_FOLDER'],
        Competition.query.get(competition_id).groundtruth)
    groundtruth = np.fromregex(groundtruth_filename, regex, record_dtype)

    # sort data so that both arrays are aligned on id
    predictions.sort(order='id')
    groundtruth.sort(order='id')

    if predictions['id'].size == 0 or not np.array_equal(
            predictions['id'], groundtruth['id']):
        # The two halves used to be concatenated without a separating
        # space ("Make sure ithas ...").
        raise ParsingError("Error parsing the submission file. Make sure it "
                           "has the right format and contains the right ids.")

    # partition the data indices into two sets and evaluate separately
    splitpoint = int(np.round(len(groundtruth) * 0.15))
    score_p = accuracy_score(groundtruth['v0'][:splitpoint],
                             predictions['v0'][:splitpoint])
    score_f = accuracy_score(groundtruth['v0'][splitpoint:],
                             predictions['v0'][splitpoint:])
    return (score_p, score_f)
def parse(self, xmlHeaderFile, quick=True):
    """
    Parse the useful part of the xml header,
    stripping time stamps and non ascii characters.

    The cleaned text is handed to the module-level ``parse`` and the result
    stored in ``self.xmlHeader``.

    With ``quick=True`` (default) every ``TimeStamp`` / ``RelTimeStamp``
    element is removed from each line before it is kept.  With
    ``quick=False`` the time stamps are additionally extracted with
    ``np.fromregex`` and the lines are kept in full (only non ascii
    characters are stripped).
    """
    # to strip the non ascii characters
    t = "".join(map(chr, list(range(256))))
    d = "".join(map(chr, list(range(128, 256))))
    is_py3 = sys.version_info > (3, 0)
    if is_py3:
        trans = str.maketrans('', '', d)
        lightXML = io.StringIO()
    else:
        import StringIO
        trans = {ord(c): None for c in d}
        lightXML = StringIO.StringIO()

    def strip_non_ascii(text):
        """Drop every character with code point 128..255 from ``text``."""
        if is_py3:
            return text.translate(trans)
        try:
            # Python 2 byte strings take (table, deletechars)
            return text.translate(t, d)
        except TypeError:
            # Python 2 unicode strings take a mapping
            return text.translate(trans)

    if not quick:
        # store all time stamps in a big array
        # NOTE(review): both arrays are only bound to locals and never used
        # afterwards -- confirm whether they should be kept on ``self``.
        timestamps = np.fromregex(
            xmlHeaderFile,
            r'<TimeStamp HighInteger="(\d+)" LowInteger="(\d+)"/>',
            float
            )
        xmlHeaderFile.seek(0)
        # The first alternative used to capture ``\f+`` (form feeds), which
        # can never match a number; ``[0-9.]+`` mirrors the pattern used by
        # the quick branch.
        # NOTE(review): the second alternative has no capture groups, so a
        # "Frame first" element yields empty strings that cannot be
        # converted to float -- confirm the intended handling.
        relTimestamps = np.fromregex(
            xmlHeaderFile,
            r'<RelTimeStamp Time="([0-9.]+)" Frame="(\d+)"/>|<RelTimeStamp Frame="[0-9]*" Time="[0-9.]*"/>',
            float
            )
        xmlHeaderFile.seek(0)
        for line in xmlHeaderFile:
            lightXML.write(strip_non_ascii(line))
    else:
        # to strip the time stamps
        m = re.compile(
            r'''<TimeStamp HighInteger="[0-9]*" LowInteger="[0-9]*"/>|'''
            + r'''<RelTimeStamp Time="[0-9.]*" Frame="[0-9]*"/>|'''
            + r'''<RelTimeStamp Frame="[0-9]*" Time="[0-9.]*"/>'''
            )
        for line in xmlHeaderFile:
            lightXML.write(strip_non_ascii(''.join(m.split(line))))
    lightXML.seek(0)
    self.xmlHeader = parse(lightXML)
def load_wrl(self, path):
    """Load ``<path>.wrl`` and ``<path>.bmp`` into attributes of ``self``.

    Three regex passes over the ``.wrl`` file fill ``self.vertices``
    (three floats followed by a comma), ``self.uv3d`` (two floats followed
    by a comma) and ``self.faces`` (three integers followed by ``-1,``).
    The bitmap is read with ``cv.imread`` into ``self.image`` and its first
    two shape entries are stored as ``self.h`` and ``self.w``.
    """
    wrl_file = path + ".wrl"

    real = r"([+-]?[0-9]*[.]?[0-9]+[e]?[+-]?[0-9]*)"
    real_zero_lead = r"([+-]?[0]*[.]?[0-9]+[e]?[+-]?[0-9]*)"
    integer = r"([+-]?[0-9]*)"

    vertex_re = r"\s" + r"\s".join([real] * 3) + ","
    uv_re = r"\s" + r"\s".join([real_zero_lead] * 2) + ","
    face_re = r"\s" + r",\s".join([integer] * 3) + ", -1,"

    self.vertices = np.fromregex(wrl_file, vertex_re, ('f'))
    self.uv3d = np.fromregex(wrl_file, uv_re, ('f'))
    self.faces = np.fromregex(wrl_file, face_re, ('i'))

    self.image = cv.imread(path + '.bmp')
    height, width, _channels = self.image.shape
    self.h = height
    self.w = width
def readMaskRegion(regfile):
    """Return the first ``box(...)`` region found in a region file.

    The file is first searched for a box whose second coordinate is
    negative (``box(xc,-yc,width",height",angle``).  When none is found a
    message is printed and the search is repeated with an unsigned second
    coordinate.

    Returns:
        The first matching record, with fields ``xc``, ``yc``, ``width``,
        ``height`` and ``angle`` (all floats).

    Raises:
        IndexError: if neither pattern matches anything in ``regfile``.
    """
    number = r"([0-9]*\.?[0-9]+)"
    negative_number = r"(-[0-9]*\.?[0-9]+)"
    # Builtin float replaces the numpy.float alias removed in NumPy 1.24.
    box_dtype = [('xc', float), ('yc', float), ('width', float),
                 ('height', float), ('angle', float)]

    def box_pattern(yc_group):
        """Build the box regex with the given pattern for ``yc``."""
        return (r"box\(" + number + "," + yc_group + "," + number
                + r'\",' + number + r'\",' + number)

    box = numpy.fromregex(regfile, box_pattern(negative_number), box_dtype)
    if len(box) == 0:
        print('Assuming a positive declination region.')
        box = numpy.fromregex(regfile, box_pattern(number), box_dtype)
    return box[0]
def toNumpy(self, regexp, dtypes):
    """Assume that the unparsed data contains line-wise data and
    transform it to a numpy-array.

    ``self.data`` is first wrapped in a ``StringIO``; if numpy rejects it
    with ``TypeError`` the data is converted to bytes and wrapped in a
    ``BytesIO`` instead.

    @param regexp: regular expression where the groups correspond to
    the dtypes,
    @param dtypes: list with dtypes"""
    import numpy as np

    try:
        text_buffer = StringIO(self.data)
        table = np.fromregex(text_buffer, regexp, dtypes)
    except TypeError:
        from PyFoam.ThirdParty.six import BytesIO, b

        byte_buffer = BytesIO(b(self.data))
        table = np.fromregex(byte_buffer, regexp, dtypes)
    return table
def load(filename):  # -> (p_vs, faces)
    """Read vertex positions and face indices from ``filename``.

    Returns ``(p_vs, faces)``: one row per vertex with the ``x``, ``y``,
    ``z`` fields, and one row per face with the ``v0``, ``v1``, ``v2``
    fields shifted down by one.
    """
    # Check face index format (only the face pattern is used here)
    _with_normal, face_pattern = check_index_format(filename)

    # Actually parse file via numpy.fromregex
    vertex_records = np.fromregex(filename, *v_pattern)
    face_records = np.fromregex(filename, *face_pattern)

    coordinate_columns = [vertex_records[axis] for axis in ('x', 'y', 'z')]
    corner_columns = [face_records[corner] for corner in ('v0', 'v1', 'v2')]

    p_vs = np.stack(coordinate_columns).T
    faces = np.stack(corner_columns).T - 1
    return p_vs, faces
def loadobj(filename, load_normals=False):
    """Load a wavefront obj file.

    The file is scanned once per requested element kind using the
    module-level ``_vertex_regex``, ``_normal_regex`` and
    ``_triangle_regex`` patterns.  Triangle indices are shifted by -1
    because the obj format uses 1-based indexing.

    Only vertex positions (and optionally normals) are loaded, and only
    triangle meshes are supported.

    Returns:
        ``(vertices, triangles)`` or, when ``load_normals`` is true,
        ``(vertices, normals, triangles)``.
    """
    def parse(regex, dtype):
        """Run one regex pass over the file, closing the handle afterwards."""
        with open(filename) as handle:
            return np.fromregex(handle, regex, dtype)

    # Builtin float/int replace the np.float/np.int aliases removed in
    # NumPy 1.24.
    vertices = parse(_vertex_regex, float)
    if load_normals:
        normals = parse(_normal_regex, float)
    triangles = parse(_triangle_regex, int) - 1  # 1-based indexing in obj file format!

    if load_normals:
        return vertices, normals, triangles
    return vertices, triangles
def day5():
    """Counting line intersections.

    Reads segments of the form ``x0,y0 -> x1,y1`` from ``5.txt`` and
    rasterises them onto a grid indexed as ``grid[y, x]``.

    Returns:
        ``(res1, res2)``: the number of cells covered more than once when
        only horizontal/vertical segments are drawn, and the same count
        after the 45-degree diagonal segments have been added as well.
    """
    crds = np.fromregex(
        "5.txt",
        r"(\d+),(\d+) -> (\d+),(\d+)",
        [("x0", np.int16), ("y0", np.int16), ("x1", np.int16), ("y1", np.int16)],
    )
    # Work with plain ints so coordinate arithmetic cannot wrap in int16.
    segments = [tuple(int(v) for v in row) for row in crds]

    # The grid is addressed as [y, x], so rows span y and columns span x.
    # (Allocating it as (x, y) silently clipped slices on non-square input.)
    height = max((max(y0, y1) for _, y0, _, y1 in segments), default=-1) + 1
    width = max((max(x0, x1) for x0, _, x1, _ in segments), default=-1) + 1
    grid = np.zeros((height, width), dtype=np.int16)

    for x0, y0, x1, y1 in segments:
        if x0 == x1 or y0 == y1:  # horiz/vert
            grid[min(y0, y1): max(y0, y1) + 1, min(x0, x1): max(x0, x1) + 1] += 1
    res1 = (grid > 1).sum()

    for x0, y0, x1, y1 in segments:
        # A zero-length segment was already drawn above; without the
        # ``x0 != x1`` guard it would be counted a second time here.
        if x0 != x1 and abs(x1 - x0) == abs(y1 - y0):  # diag
            yd = 1 if y1 >= y0 else -1
            xd = 1 if x1 >= x0 else -1
            for y, x in zip(range(y0, y1 + yd, yd), range(x0, x1 + xd, xd)):
                grid[y, x] += 1
    res2 = (grid > 1).sum()
    return res1, res2
def read_volumes_global_areas(seg_stats_file):
    """Return the numeric value of every ``# Measure`` line of a stats file.

    Each line of the form
    ``# Measure <name>, <name>, <description>, <number>, <unit>``
    contributes its ``<number>`` column.  Values are returned as a flat
    array in the order the lines appear in the file, e.g.

        # Measure lhCortex, lhCortexVol, Left hemisphere cortical gray matter volume, 234615.987869, mm^3
        # Measure IntraCranialVol, ICV, Intracranial Volume, 1137205.249190, mm^3
    """
    text_group = r'([\w/+_\- ]+)'
    number_group = r'([\d\.]+)'
    unit_group = r'([\w/+_\-^]+)'
    wb_regex_pattern = '# Measure ' + ', '.join(
        [text_group, text_group, text_group, number_group, unit_group])

    datatypes = np.dtype('U100,U100,U100,f8,U10')
    stats = np.fromregex(seg_stats_file, wb_regex_pattern, dtype=datatypes)

    # column 3 is the numeric measurement
    volumes = [measure[3] for measure in stats]
    return np.array(volumes).flatten()
def load_sv(fname, format=None): if format is None: return np.genfromtxt(fname) floatingReString=r'([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)' complexReString =r'\(\s*'+floatingReString+'\s*,\s*'+floatingReString+'\s*\)' return np.fromregex(fname,format.replace(r'+',r'\s*').replace('f',floatingReString).replace('c',complexReString),np.float)
def parse_x_y_dut1_from_finals_all(f):
    """Parse ``f`` with the module-level ``_R`` pattern.

    Returns a structured array with the float fields ``utc_mjd``,
    ``x_arcseconds``, ``y_arcseconds`` and ``dut1``, one record per match.
    """
    field_names = ('utc_mjd', 'x_arcseconds', 'y_arcseconds', 'dut1')
    record_dtype = [(field, float) for field in field_names]
    return np.fromregex(f, _R, record_dtype)
def readheader(catalog):
    '''
    This function extracts the #ttype indexed header of a catalog.

    Input:
    catalog = [string], Name (perhaps including path) of the catalog that
        contains all of the data (e.g. x,y,e1,e2,...). Must include ttype
        header designations for the columns e.g.:
        #ttype0 = objid
        #ttype1 = x

    Output:
    dic = dictionary that maps each ttype name to its column number.
        Column numbers are shifted to be 0 based when the first ttype
        entry of the catalog is numbered 1.

    If the catalog has no ttype entries, or the first entry is numbered
    neither 0 nor 1, a message is printed and ``sys.exit()`` is called.
    '''
    import numpy
    import sys

    header = numpy.fromregex(catalog, r"ttype([0-9]+)(?:\s)?=(?:\s)?(\w+)",
                             [('column', numpy.int64), ('name', 'S20')])

    # Determine if the catalog is 0 or 1 indexed and if 1 indexed then
    # change to 0.  An empty header is reported like a wrongly indexed one
    # instead of failing with an IndexError.
    first_column = header['column'][0] if len(header) else None
    if first_column == 1:
        header['column'] -= 1
    elif first_column != 0:
        print('readheader: the catalog is not ttype indexed, please index using format ttype(column#)=(column name), exiting')
        sys.exit()

    # name -> column number
    return {entry[1]: entry[0] for entry in header}
def get_goddard_dict(filepath):
    """Save Goddard data to a dictionary, where each HDF5 file is an entry.

    The file is parsed with a regex assembled from the module-level
    ``NUM_MATCH1``, ``NUM_MATCH`` and ``TEXT_MATCH`` pieces into the
    fields ``alt``, ``o3``, ``o3_error`` and ``filename``.  Rows are then
    grouped by their ``filename`` value.

    Returns:
        dict mapping each distinct filename to a dict holding the ``alt``,
        ``o3`` and ``o3_error`` arrays of the rows carrying that filename
        (in file order).
    """
    regex = r"\s+".join([NUM_MATCH1] + [NUM_MATCH] * 2 + [TEXT_MATCH])
    # Builtin float replaces the np.float alias removed in NumPy 1.24.
    data = np.fromregex(filepath, regex, dtype={
        'names': ('alt', 'o3', 'o3_error', 'filename'),
        'formats': (float, float, float, 'U30'),
    })

    filenames = data["filename"]
    goddard_dict = {}
    for each_filename in set(filenames.tolist()):
        # boolean mask selects the same rows, in the same order, as a
        # per-name scan over every index would
        rows = filenames == each_filename
        goddard_dict[each_filename] = {
            "alt": data["alt"][rows],
            "o3": data["o3"][rows],
            "o3_error": data["o3_error"][rows],
        }
    return goddard_dict
def parse_x_y_dut1_from_finals_all(f):
    """Parse ``f`` with the module-level ``_R`` pattern.

    Returns the four float32 columns ``(mjd_utc, x, y, dut1)`` as separate
    arrays.
    """
    columns = ('mjd_utc', 'x', 'y', 'dut1')
    data = np.fromregex(f, _R, [(column, np.float32) for column in columns])
    mjd_utc, x, y, dut1 = (data[column] for column in columns)
    return mjd_utc, x, y, dut1
def read_orca_trj(fname): """return numpy 2D array """ # http://stackoverflow.com/questions/14645789/ # numpy-reading-file-with-filtering-lines-on-the-fly import numpy as np regexp = r'\s+\w+' + r'\s+([-.0-9]+)' * 3 + r'\s*\n' return np.fromregex(fname, regexp, dtype='f')
def readregions(regfile):
    '''
    regfile = (string) the ds9 region file, assumes that it was written using
    'ds9' Format and 'image' Coordinate System

    Currently this function only works on circles, ellipse, and box regions.
    Only unsigned numbers are recognised.

    Returns (circ, ellip, box), three structured arrays with fields
      circ:  xc, yc, rc
      ellip: xc, yc, a, b, angle
      box:   xc, yc, width, height, angle
    An array is empty when the file holds no region of that kind.
    '''
    number = r"([0-9]*\.?[0-9]+)"

    def find(shape, fields):
        """Parse every ``shape(n,n,...)`` entry into the named float fields."""
        pattern = shape + r"\(" + ",".join([number] * len(fields))
        # Builtin float replaces the numpy.float alias removed in NumPy 1.24.
        return numpy.fromregex(regfile, pattern,
                               [(field, float) for field in fields])

    # find all the circle regions
    circ = find("circle", ('xc', 'yc', 'rc'))
    # find all the elliptical regions
    ellip = find("ellipse", ('xc', 'yc', 'a', 'b', 'angle'))
    # find all the box regions
    box = find("box", ('xc', 'yc', 'width', 'height', 'angle'))
    return circ, ellip, box
def get_stocks():
    """Fill the global ``gldict`` with one parsed array per entry of ``names``.

    For every name the file ``<name>.csv`` is used; when it does not exist
    ``yh_download(name)`` is called first.  The file is then parsed with
    ``fromregex`` using ``yf_regex`` / ``yf_dtype`` and stored under the
    name.
    """
    global gldict
    for symbol in names:
        csv_name = "%s.csv" % symbol
        is_cached = os.path.exists(csv_name)
        if not is_cached:
            yh_download(symbol)
        gldict[symbol] = fromregex(csv_name, yf_regex, yf_dtype)
def get_ordered_dict(self, fpath):
    """Parse ``fpath`` and group the records by their ``name`` field.

    Each entry of the module-level ``REGEXES`` is unpacked as the
    remaining arguments of ``np.fromregex``; the first one that yields at
    least one record is used.

    Returns:
        OrderedDict mapping every distinct ``name`` to the records carrying
        it, ordered from the largest group to the smallest.

    Raises:
        Exception: if none of the regexes matches anything in ``fpath``.
    """
    data = None
    for regex in REGEXES:
        candidate = np.fromregex(fpath, *regex)
        if len(candidate):
            data = candidate
            break
    # ``data == None`` on an ndarray is an element-wise comparison and
    # never signalled "no match"; track the outcome explicitly instead.
    if data is None:
        raise Exception('no matches for any regex (%s)' % (fpath,))

    names = set(data['name'])  # TEST, TRAIN, TRAIN_BCH, TRAIN_EP, ...
    table = {name: data[data['name'] == name] for name in names}
    return OrderedDict(sorted(table.items(),
                              key=lambda tup: len(tup[1]),
                              reverse=True))
def test_record(self):
    """fromregex fills a (float64, 3-char string) record array from a stream."""
    dt = [('num', np.float64), ('val', 'S3')]
    expected = np.array([(1.312, 'foo'), (1.534, 'bar'), (4.444, 'qux')],
                        dtype=dt)

    stream = StringIO.StringIO()
    stream.write('1.312 foo\n1.534 bar\n4.444 qux')
    stream.seek(0)

    parsed = np.fromregex(stream, r"([0-9.]+)\s+(...)", dt)
    assert_array_equal(parsed, expected)
def test_record_3(self):
    """A regex with a single group fills a one-field record array."""
    dt = [('num', np.float64)]
    expected = np.array([(1312,), (1534,), (4444,)], dtype=dt)

    stream = StringIO.StringIO()
    stream.write('1312 foo\n1534 bar\n4444 qux')
    stream.seek(0)

    parsed = np.fromregex(stream, r"(\d+)\s+...", dt)
    assert_array_equal(parsed, expected)
def read_tag_data_from_file(
        tag, data_dir="D:/Descargas/Universidad/TFG/analisis/sensores/"):
    """Read the ``POS`` records of one tag's sensor log.

    The file ``<data_dir><tag>.txt`` is scanned for lines of the form
    ``POS,<x>,<y>,<z>,<qf>``.

    Args:
        tag: file name (without ``.txt``) of the log to read.
        data_dir: directory holding the log files.  Defaults to the
            location that used to be hard-coded here.

    Returns:
        Structured array with fields ``x``, ``y``, ``z`` (float64) and
        ``qf`` (int32).  The first matching record is dropped.
    """
    import os

    log_path = os.path.join(data_dir, tag + ".txt")
    records = np.fromregex(log_path, r"POS,(.+),(.+),(.+),(.+)",
                           [('x', np.float64), ('y', np.float64),
                            ('z', np.float64), ('qf', np.int32)])
    # NOTE(review): the first record is skipped on purpose by the original
    # code; the reason is not visible here.
    return records[1:]
def test_record(self):
    """fromregex fills a (float64, 3-char string) record array from bytes."""
    dt = [("num", np.float64), ("val", "S3")]
    expected = np.array([(1.312, "foo"), (1.534, "bar"), (4.444, "qux")],
                        dtype=dt)

    stream = StringIO()
    stream.write(asbytes("1.312 foo\n1.534 bar\n4.444 qux"))
    stream.seek(0)

    parsed = np.fromregex(stream, r"([0-9.]+)\s+(...)", dt)
    assert_array_equal(parsed, expected)
def test_record_2(self):
    """fromregex fills an (int32, 3-char string) record array from bytes."""
    dt = [("num", np.int32), ("val", "S3")]
    expected = np.array([(1312, "foo"), (1534, "bar"), (4444, "qux")],
                        dtype=dt)

    stream = StringIO()
    stream.write(asbytes("1312 foo\n1534 bar\n4444 qux"))
    stream.seek(0)

    parsed = np.fromregex(stream, r"(\d+)\s+(...)", dt)
    assert_array_equal(parsed, expected)
def test_record_2(self):
    """fromregex fills an (int32, 3-char string) record array from a stream."""
    dt = [('num', np.int32), ('val', 'S3')]
    expected = np.array([(1312, 'foo'), (1534, 'bar'), (4444, 'qux')],
                        dtype=dt)

    stream = StringIO.StringIO()
    stream.write('1312 foo\n1534 bar\n4444 qux')
    stream.seek(0)

    parsed = np.fromregex(stream, r"(\d+)\s+(...)", dt)
    assert_array_equal(parsed, expected)
def test_record_3(self):
    """Only the captured group ends up in the one-field record array."""
    dt = [('num', np.float64)]
    expected = np.array([(1312, ), (1534, ), (4444, )], dtype=dt)

    source = StringIO.StringIO()
    source.write('1312 foo\n1534 bar\n4444 qux')
    source.seek(0)

    result = np.fromregex(source, r"(\d+)\s+...", dt)
    assert_array_equal(result, expected)
def parseSequences(Regexp, Filename, RemoveLineEnding):
    """Extract ``(id, sequence)`` records from ``Filename``.

    Args:
        Regexp: regular expression with two groups (id, sequence).
        Filename: file to scan.
        RemoveLineEnding: when true, every sequence has all whitespace
            (including line breaks) removed and is upper-cased.

    Returns:
        Structured array with the object fields ``id`` and ``sequence``.
    """
    Data = np.fromregex(Filename, Regexp,
                        [('id', object), ('sequence', object)])
    if RemoveLineEnding:
        whitespace = re.compile(r"[\r\n\s]+")
        sequences = Data['sequence']  # view: writes go through to Data
        for index, raw_sequence in enumerate(sequences):
            sequences[index] = whitespace.sub("", raw_sequence).upper()
    return Data
def test_record_2(self):
    """(Currently disabled) int32/string record parsing from a stream."""
    return  # pass this test until #736 is resolved

    # Unreachable until the early return above is removed.
    dt = [("num", np.int32), ("val", "S3")]
    expected = np.array([(1312, "foo"), (1534, "bar"), (4444, "qux")],
                        dtype=dt)

    stream = StringIO.StringIO()
    stream.write("1312 foo\n1534 bar\n4444 qux")
    stream.seek(0)

    parsed = np.fromregex(stream, r"(\d+)\s+(...)", dt)
    assert_array_equal(parsed, expected)
def do_once(self):
    """Parse ``self.file`` from the start and total ``size`` per (month, day).

    The file is rewound, parsed with ``self.line_re`` into eight fields
    and the ``size`` values are accumulated in a local mapping keyed by
    ``(month, day)``.  Nothing is returned or stored.
    """
    self.file.seek(0)
    log_dtype = [
        ('ip', 'S20'),
        ('day', 'S25'),
        ('month', 'S20'),
        ('year', 'S4'),
        ('time', 'S20'),
        ('method', 'S7'),
        ('path', 'S100'),
        ('size', numpy.int32),
    ]
    output = numpy.fromregex(self.file, self.line_re, log_dtype)

    total_time_by_month = defaultdict(int)
    for month, day, size in zip(output['month'], output['day'],
                                output['size']):
        total_time_by_month[(month, day)] += size
def load_dataset(csv_file, image_root, fail_on_missing=True): """ Loads a dataset .csv file, returning PIDs and FIDs. PIDs are the "person IDs", i.e. class names/labels. FIDs are the "file IDs", which are individual relative filenames. Args: csv_file (string, file-like object): The csv data file to load. image_root (string): The path to which the image files as stored in the csv file are relative to. Used for verification purposes. If this is `None`, no verification at all is made. fail_on_missing (bool or None): If one or more files from the dataset are not present in the `image_root`, either raise an IOError (if True) or remove it from the returned dataset (if False). Returns: (pids, fids) a tuple of numpy string arrays corresponding to the PIDs, i.e. the identities/classes/labels and the FIDs, i.e. the filenames. Raises: IOError if any one file is missing and `fail_on_missing` is True. """ #np.fromregex(csv_file, r'(\d+),"(.+)"', np.object) if 'clothing1m' in csv_file: dataset = np.fromregex(csv_file, r'(\d+),"(.+)"', np.object) else: dataset = np.genfromtxt(csv_file, delimiter=',', dtype='|U') pids, fids = dataset.T # Possibly check if all files exist if image_root is not None: missing = np.full(len(fids), False, dtype=bool) for i, fid in enumerate(fids): missing[i] = not os.path.isfile(os.path.join(image_root, fid)) missing_count = np.sum(missing) if missing_count > 0: if fail_on_missing: raise IOError('Using the `{}` file and `{}` as an image root {}/' '{} images are missing'.format( csv_file, image_root, missing_count, len(fids))) else: print('[Warning] removing {} missing file(s) from the' ' dataset.'.format(missing_count)) # We simply remove the missing files. fids = fids[np.logical_not(missing)] pids = pids[np.logical_not(missing)] return pids, fids
def combine_history_statsfiles(cnavgdir):
    """Stack all ``HISTORY_STATS_<sim>`` files of a directory into one table.

    Every file contributes a block of ``runlen`` rows (``runlen`` being the
    largest number of records found in any file) starting at row
    ``sim * runlen``.  Each row holds a history id
    (``record index + sim * Global_BINWIDTH``) followed by the six integers
    parsed from the record.  Rows for missing simulations, and the tail of
    a block whose file is shorter than ``runlen``, stay zero.

    Returns:
        Integer array of shape ``((max_sim + 1) * runlen, 7)``, or an empty
        array when the directory holds no ``HISTORY_STATS*`` file.
    """
    stats_pattern = r"\((\d+), (\d+)\)\t(\d+)\t(\d+)\t(\d+)\t(\d+)"

    def read_stats(path):
        """Parse one stats file into an (n, 6) integer array."""
        # reshape keeps the column count even when nothing matched
        return np.fromregex(path, stats_pattern, dtype=int).reshape(-1, 6)

    statsfiles = glob.glob(cnavgdir + "/" + "HISTORY_STATS*")
    sys.stderr.write("statsfiles: %s\n" % (str(statsfiles)))
    mystats = np.array([])
    if not statsfiles:
        # nothing to combine (max() below would fail on an empty list)
        return mystats

    mysims = []
    runlens = []
    for statsfile in statsfiles:
        sim = int(re.match(r".*HISTORY_STATS_(\d+)", statsfile).group(1))
        mysims.append(sim)
        runlens.append(read_stats(statsfile).shape[0])
    runlen = max(runlens)

    for sim in mysims:
        statsfile = os.path.join(cnavgdir, "HISTORY_STATS_%d" % sim)
        historystats = read_stats(statsfile)
        if mystats.size == 0:
            mystats = np.zeros(((max(mysims) + 1) * runlen,
                                historystats.shape[1] + 1), dtype=int)
        nrows = historystats.shape[0]
        hids = np.arange(nrows) + sim * Global_BINWIDTH
        i = sim * runlen
        # Only ``nrows`` rows are written so that a file shorter than the
        # longest run no longer triggers a shape mismatch.
        mystats[i: i + nrows, :] = np.hstack((np.atleast_2d(hids).T,
                                              historystats))
    return mystats
def parseTEM(filename):
    """Read a list of transformation matrices from ``filename``.

    Every number in the file is collected in order.  The second number
    gives the count of records; starting from the fourth number the values
    are grouped into that many rows of six, of which the first three are a
    position and the last three are passed to ``eulerToMatrix``.

    Returns:
        List with one matrix per record: the result of ``eulerToMatrix``
        with the position written into the first three entries of its row
        index 3.
    """
    regexp = r"[-+]?[0-9]*\.?[0-9]+"
    # Builtin float replaces the numpy.float alias removed in NumPy 1.24.
    # fromregex opens and closes the file itself, so no extra handle is
    # opened here.
    res = numpy.fromregex(filename, regexp, [('num', float), ])
    numbers = res['num']
    res2 = numbers[3:].reshape((int(numbers[1]), 6))
    pos, rot = numpy.hsplit(res2, 2)

    M = []
    for position, angles in zip(pos, rot):
        mrot = eulerToMatrix(angles)
        mrot[3][:3] = position
        M.append(mrot)
    return M
def readFromFile_CoordsXYZ(FN, forceRead=False):
    """Read coordinates from file ``FN`` and return them as a float array.

    Unless ``forceRead`` is set, the file name is first checked with
    ``doRegexMatch`` against ``names.pixelCoordsFNRE``.  The file must
    exist (asserted).  Its content is parsed with ``myRE.floatREx3``.
    """
    # Sanity check filename:
    skip_name_check = forceRead
    if not skip_name_check:
        doRegexMatch(FN, names.pixelCoordsFNRE)
    assert (fileExists(FN))

    # Read (x,y,z) coordinate and return the array
    coordinate_pattern = myRE.compile(myRE.floatREx3)
    return np.fromregex(FN, coordinate_pattern, dtype='f')
def main(fname):
    """Read ``V`` and ``T`` records from ``fname`` and plot them.

    ``V <index> <x> <y> <z>`` lines give the vertex coordinates and
    ``T <v0> <v1> <v2> <v3>`` lines give four integer indices each.  The
    shapes of both arrays are printed, rows of ``T`` containing any value
    from 2 to 6 are dropped, and the result is passed to ``plot``.
    """
    # Raw strings: "\s", "\d" and "\S" are invalid escapes in normal ones.
    vs = np.fromregex(
        fname,
        r"V\s+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)",
        [("index", np.int32), ("x", np.float64), ("y", np.float64), ("z", np.float64)],
    )
    # drop the leading index column, keep x/y/z
    vs = np.array([list(record) for record in vs])[:, 1:]

    ts = np.fromregex(
        fname,
        r"T\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)",
        [("v0", np.int32), ("v1", np.int32), ("v2", np.int32), ("v3", np.int32)],
    )
    ts = np.array([list(record) for record in ts]).astype(int)
    print(vs.shape, ts.shape)

    # filter dummy vertices
    ts = ts[np.all((ts < 2) | (ts > 6), axis=1), :]
    plot(vs, ts)
def _parse_input_csv(block_offset_size_queue, csv_filepath, numpy_array_queue):
    """Parse CSV file chunks into (date, userhash, lng, lat) record arrays.

    Parameters:
        block_offset_size_queue (multiprocessing.Queue): contains tuples of
            the form (offset, chunk size) to direct where the file should
            be read from; the string 'STOP' ends the loop.
        csv_filepath (string): path to csv file to parse from
        numpy_array_queue (multiprocessing.Queue): receives one structured
            array per chunk with fields ``f0`` (datetime64[D] date),
            ``f1`` (last 4 bytes of the md5 digest of the user id),
            ``f2`` (longitude) and ``f3`` (latitude), followed by 'STOP'
            once the input queue is exhausted.

    Returns:
        None

    """
    # sample line:
    # 8568090486,48344648@N00,2013-03-17 16:27:27,42.383841,-71.138378,16
    # this pattern matches the above style of line and only parses valid
    # dates to handle some cases where there are weird dates in the input.
    # The century alternation is grouped as (?:19|20); ungrouped, "19" was
    # an alternative to the whole rest of the date, so 19xx dates never
    # matched.
    pattern = r"[^,]+,([^,]+),((?:19|20)\d\d-(?:0[1-9]|1[012])-(?:0[1-9]|[12][0-9]|3[01])) [^,]+,([^,]+),([^,]+),[^\n]"  # pylint: disable=line-too-long

    def md5hash(user_string):
        """md5hash userid."""
        return hashlib.md5(user_string).digest()[-4:]

    md5hash_v = numpy.vectorize(md5hash, otypes=['S4'])

    for file_offset, chunk_size in iter(block_offset_size_queue.get, 'STOP'):
        with open(csv_filepath, 'r') as csv_file:
            csv_file.seek(file_offset, 0)
            chunk_string = csv_file.read(chunk_size)

        try:
            chunk_string = unicode(chunk_string)
        except NameError:
            # Python 3, it's already unicode
            pass

        result = numpy.fromregex(
            StringIO(chunk_string), pattern,
            [('user', 'S40'), ('date', 'datetime64[D]'), ('lat', 'f4'),
             ('lng', 'f4')])

        hashes = md5hash_v(result['user'])

        # 'S4' replaces the 'a4' alias, which NumPy 2 no longer accepts.
        user_day_lng_lat = numpy.empty(
            hashes.size, dtype='datetime64[D],S4,f4,f4')
        user_day_lng_lat['f0'] = result['date']
        user_day_lng_lat['f1'] = hashes
        user_day_lng_lat['f2'] = result['lng']
        user_day_lng_lat['f3'] = result['lat']
        numpy_array_queue.put(user_day_lng_lat)
    numpy_array_queue.put('STOP')
def parse(self, xmlHeaderFile, quick=True):
    """
    Parse the useful part of the xml header,
    stripping time stamps and non ascii characters.

    The cleaned text is handed to the module-level ``parse`` and the result
    stored in ``self.xmlHeader``.

    With ``quick=True`` (default) every ``TimeStamp`` / ``RelTimeStamp``
    element is removed from each line before it is kept.  With
    ``quick=False`` the time stamps are additionally extracted with
    ``np.fromregex`` and the lines are kept in full (only non ascii
    characters are stripped).
    """
    lightXML = StringIO.StringIO()
    # to strip the non ascii characters
    t = "".join(map(chr, range(256)))
    d = "".join(map(chr, range(128, 256)))
    if not quick:
        # store all time stamps in a big array
        # NOTE(review): both arrays are only bound to locals and never used
        # afterwards -- confirm whether they should be kept on ``self``.
        timestamps = np.fromregex(
            xmlHeaderFile,
            r'<TimeStamp HighInteger="(\d+)" LowInteger="(\d+)"/>',
            float
            )
        xmlHeaderFile.seek(0)
        # The first alternative used to capture ``\f+`` (form feeds), which
        # can never match a number; ``[0-9.]+`` mirrors the pattern used by
        # the quick branch.
        # NOTE(review): the second alternative has no capture groups, so a
        # "Frame first" element yields empty strings that cannot be
        # converted to float -- confirm the intended handling.
        relTimestamps = np.fromregex(
            xmlHeaderFile,
            r'<RelTimeStamp Time="([0-9.]+)" Frame="(\d+)"/>|<RelTimeStamp Frame="[0-9]*" Time="[0-9.]*"/>',
            float
            )
        xmlHeaderFile.seek(0)
        for line in xmlHeaderFile:
            lightXML.write(line.translate(t, d))
    else:
        # to strip the time stamps
        m = re.compile(
            r'''<TimeStamp HighInteger="[0-9]*" LowInteger="[0-9]*"/>|'''
            + r'''<RelTimeStamp Time="[0-9.]*" Frame="[0-9]*"/>|'''
            + r'''<RelTimeStamp Frame="[0-9]*" Time="[0-9.]*"/>'''
            )
        for line in xmlHeaderFile:
            lightXML.write(''.join(m.split(line)).translate(t, d))
    lightXML.seek(0)
    self.xmlHeader = parse(lightXML)
def load_nakamuraspr(fn):
    """Load a spr file as returned by Nakamura et al.'s alignment methods.

    Fields of the file format as specified in [8]_:
        ID (onset time) (offset time) (spelled pitch) (onset velocity)
        (offset velocity) channel

    These files contain extra information not included in match or corresp
    files, particularly duration and pedal information, and can be used to
    complement the information from the `load_nakamuracorresp` or
    `load_nakamuramatch`.

    Parameters
    ----------
    fn : str
        The file to read; each record is seven tab-separated columns.

    Returns
    -------
    note_array : structured array
        One record per parsed line with the fields ``onset_sec``,
        ``duration_sec`` (offset minus onset), ``pitch`` (from
        ``note_name_to_midi_pitch``), ``velocity`` (onset velocity),
        ``channel`` and ``id``.  The offset velocity is not stored.

    References
    ----------
    .. [8] https://midialignment.github.io/MANUAL.pdf

    TODO
    ----
    * Import pedal information
    """
    spr_dtype = [
        ("ID", "U256"),
        ("Ontime", "f"),
        ("Offtime", "f"),
        ("Sitch", "U256"),
        ("Onvel", "i"),
        ("Offvel", "i"),
        ("Channel", "i"),
    ]
    # numeric id followed by six tab-separated columns
    pattern = r"(\d+)" + r"\t(.+)" * 6
    result = np.fromregex(fn, pattern, dtype=spr_dtype)

    note_array = np.empty(
        len(result),
        dtype=[
            ("onset_sec", "f4"),
            ("duration_sec", "f4"),
            ("pitch", "i4"),
            ("velocity", "i4"),
            ("channel", "i4"),
            ("id", "U256"),
        ],
    )

    onsets = result["Ontime"]
    note_array["id"] = result["ID"]
    note_array["onset_sec"] = onsets
    note_array["duration_sec"] = result["Offtime"] - onsets
    note_array["pitch"] = np.array(
        [note_name_to_midi_pitch(spelled) for spelled in result["Sitch"]])
    note_array["velocity"] = result["Onvel"]
    note_array["channel"] = result["Channel"]
    return note_array
def estimate_band_coefficients(self, sat=None, instr=None, ch=None):
    """Estimate band coefficients for fast/explicit BT calculations

    In some circumstances, a fully integrated SRF may be more
    expensive than needed.  We can then choose an effective wavelength
    lambda_c along with coefficients alpha, beta such that instead of
    integrating, we estimate R = B(lambda*, T*), with T* = alpha + beta * T_B
    and lambda* a wavelength which may be close to the centroid
    lambda_c (but there is no guarantee).  Such an approximation
    eliminates the explicit use of an integral which can make
    analysis easier.

    The coefficients are not computed here: they are read from the band
    file configured for ``instr`` (formatted with ``sat``), which is
    expected to provide 19 x 7 records, and selected for channel ``ch``.
    A ``UserWarning`` is emitted to signal this.  Uncertainties are the
    absolute difference between the entries at shift position 1 and 0.

    Returns:

        alpha (float): Offset in approximation for T*
        beta (float): Slope in approximation for T*
        lambda_eff (float): Effective wavelength
        delta_alpha (float): Uncertainty in alpha
        delta_beta (float): Uncertainty in beta
        delta_lambda_eff (float): Uncertainty in lambda_eff
    """
    warnings.warn("Obtaining band coefficients from file", UserWarning)

    srcfile = config.conf[instr]["band_file"].format(sat=sat)
    rxp = r"(.{5,6})_ch(\d\d?)_shift([+-]\d+)pm\.nc\s+([\d.]+)\s+(-?[\de\-.]+)\s+([\d.]+)"
    dtp = [
        ("satname", "S6"),
        ("channel", "u1"),
        ("shift", "i2"),
        ("centre", "f4"),
        ("alpha", "f4"),
        ("beta", "f4"),
    ]
    M = numpy.fromregex(srcfile, rxp, dtp).reshape(19, 7)

    dims = ("channel", "shiftno")
    data_vars = {
        field: (dims, M[field])
        for field in ("centre", "alpha", "beta", "shift")
    }
    ds = xarray.Dataset(data_vars, coords={"channel": M["channel"][:, 0]})
    ds = ds.sel(channel=ch)

    # original author's note: varies 1.1 - 15.2 nm depending on channel
    ds0 = ds.sel(shiftno=0)

    centre_attrs = {"units": "1/cm"}
    alpha_attrs = {"units": "K"}
    beta_attrs = {"units": "1"}

    lambda_c = UADA(ds0["centre"], attrs=centre_attrs)
    alpha = UADA(ds0["alpha"], attrs=alpha_attrs)
    beta = UADA(ds0["beta"], attrs=beta_attrs)

    delta_ds = ds.sel(shiftno=1) - ds0
    delta_lambda_c = abs(UADA(delta_ds["centre"], attrs=centre_attrs))
    delta_alpha = abs(UADA(delta_ds["alpha"], attrs=alpha_attrs))
    delta_beta = abs(UADA(delta_ds["beta"], attrs=beta_attrs))

    return (alpha, beta, lambda_c,
            delta_alpha, delta_beta, delta_lambda_c)
def main():
    """Print the lines of ``SnapshotInfo.csv`` selected by a regex.

    The file is looked up in ``<cwd>/<args.imgfolder>/`` where ``args``
    comes from ``options()``.
    """
    args = options()
    path = os.getcwd()
    snapshots = str(path) + "/" + str(args.imgfolder) + "/SnapshotInfo.csv"

    # NOTE(review): without re.MULTILINE this pattern can only match when
    # the file is a single line, and "B*" also matches lines without a "B"
    # -- confirm the intended selection.
    regex = "^.*B*.*$"
    # np.fromregex requires a dtype; each whole match is kept as text
    # (at most 1024 characters).
    select = np.fromregex(snapshots, regex, [("line", "U1024")])
    # The name printed before, ``snapshot_data``, was never defined.
    print(select)
def main():
    """Print the lines of ``SnapshotInfo.csv`` selected by a regex.

    The file is looked up in ``<cwd>/<args.imgfolder>/`` where ``args``
    comes from ``options()``.
    """
    args = options()
    path = os.getcwd()
    snapshots = str(path) + "/" + str(args.imgfolder) + "/SnapshotInfo.csv"

    # NOTE(review): without re.MULTILINE this pattern can only match when
    # the file is a single line, and "B*" also matches lines without a "B"
    # -- confirm the intended selection.
    regex = "^.*B*.*$"
    # np.fromregex requires a dtype; each whole match is kept as text
    # (at most 1024 characters).
    select = np.fromregex(snapshots, regex, [("line", "U1024")])
    # The name printed before, ``snapshot_data``, was never defined.
    print(select)
def load_kaldi_priors(path, prior_cutoff, uniform_smoothing_scaler=0.05):
    """Turn the class counts stored in ``path`` into log-priors.

    Every number found in the file is taken as one class count.  A uniform
    smoothing count, ``ceil(mean(counts) * uniform_smoothing_scaler)``, is
    added to each count before dividing by the sum of the raw counts.
    Priors below ``prior_cutoff`` are raised to ``prior_cutoff``.

    Returns:
        Array with the natural logarithm of the priors.

    Raises:
        AssertionError: if ``uniform_smoothing_scaler`` is outside [0, 1],
            if any prior falls outside (0, 1], or if any log-prior is
            infinite.
    """
    scaler_is_valid = 0 <= uniform_smoothing_scaler <= 1.0
    assert scaler_is_valid, (
        "Expected 0 <= uniform_smoothing_scaler <=1, got %f" % uniform_smoothing_scaler
    )

    parsed = np.fromregex(path, r"([\d\.e+]+)", dtype=[('num', np.float32)])
    class_counts = np.asarray(parsed['num'], dtype=theano.config.floatX)

    # compute the uniform smoothing count
    smoothing_count = np.ceil(class_counts.mean() * uniform_smoothing_scaler)
    total_count = class_counts.sum()
    priors = (class_counts + smoothing_count) / total_count

    # floor zeroes to something small so log() on that will be different
    # from -inf or better skip these in contribution at all i.e. set to -log(0)?
    below_cutoff = priors < prior_cutoff
    priors[below_cutoff] = prior_cutoff

    assert np.all(priors > 0) and np.all(priors <= 1.0), (
        "Prior probabilities outside [0,1] range."
    )

    log_priors = np.log(priors)
    assert not np.any(np.isinf(log_priors)), (
        "Log-priors contain -inf elements."
    )
    return log_priors
def get_dt(self):
    """
    Returns the time step of the simulation.

    The value is read from the first ``UNIT_TIME <number>`` line (preceded
    by whitespace) of the file returned by ``self.get_data_path()``.

    Returns
    -------
    A float in seconds.

    Raises
    ------
    IndexError
        If the data file holds no ``UNIT_TIME`` line.
    """
    data_file_path = self.get_data_path()
    # matches floats and scientific floats.  The former pattern opened with
    # the class ``[-\+[0-9]``, which also accepted a literal '[' and
    # triggered the "possible nested set" FutureWarning of ``re``.
    rg_flt = r"([-+]?(?:[0-9]+\.[0-9]*|\.[0-9]+)(?:[Ee][-+]?[0-9]+)?)"
    # our UNIT_TIME is scaled to dt
    dt = np.fromregex(
        data_file_path,
        r"\s+UNIT_TIME " + rg_flt + r"\n",
        dtype=np.dtype([('dt', 'float')])
    )
    return dt['dt'][0]
def apache_time(s):
    """Convert an access-log time string into a timezone-aware datetime.

    The string is read at fixed positions: day ``[0:2]``, month
    abbreviation ``[3:6]`` (looked up in ``month_map``), year ``[7:11]``,
    hours ``[12:14]``, minutes ``[15:17]``, seconds ``[18:20]``; the last
    five characters are handed to ``Timezone``.
    """
    return datetime.datetime(
        int(s[7:11]),          # year
        month_map[s[3:6]],     # month
        int(s[0:2]),           # day
        int(s[12:14]),         # hours
        int(s[15:17]),         # minutes
        int(s[18:20]),         # seconds
        0,                     # microseconds
        Timezone(s[-5:]),
    )


# 'size' and 'time' are passed through the converters given as keyword
# arguments below.
ary = fromregex(
    'access_log',
    regex_str,
    [
        ('host', object),
        ('user', object),
        ('time', object),
        ('request', object),
        ('status', int16),
        # I *think* this is guaranteed to be an int32...
        ('size', int64),
        # Since this can be '-', it can't be a number.
        ('referer', object),
        ('agent', object),
    ],
    size=size_converter,
    time=apache_time,
)

#----------------------------------------------------------------------------
# Find all the requests that generated a 404
#----------------------------------------------------------------------------
is_404 = ary['status'] == 404
ary404 = ary[is_404]
ary404['request']
requests404 = set(ary404['request'])

#----------------------------------------------------------------------------
# Generate a report on requests that generated 404 codes.
#----------------------------------------------------------------------------
def test_fromregex(): time.sleep(0.2) regexp = r"(\d+)\s+(...)" # match [digits, whitespace, anything] return numpy.fromregex("test_fromregex.dat", regexp, [("num", numpy.int64), ("key", "S3")])
ret_regex += '(' + record[i][1] + '{' + record[i][2] + '})' return r'(' + record_type + ')' + ret_regex + r'\n' def data_type_list(record): ret_val_list = [] for i in range(0, len(record)): ret_val_list.append((record[i][0], record[i][3])) return ret_val_list #def main(): Take this out while working in ipython input_file = sys.argv[1] regexp = regex_string(pvf_weights_rec, PVF_WEIGHTING_REC_TYPE) data_types = data_type_list(pvf_weights_rec) weights_df = pd.DataFrame(np.fromregex(input_file, regexp, data_types)) del weights_df['Padding'] #Get weights for BBC network panel only. weights_df = weights_df[weights_df.ReportingPanel == BBC_NETWORK_PANEL] regexp = regex_string(pvf_members_rec, PVF_MEMBER_REC_TYPE) data_types = data_type_list(pvf_members_rec) members_df = pd.DataFrame(np.fromregex(input_file, regexp, data_types)) del members_df['Padding'] regexp = regex_string(pvf_vwg_rec, PVF_VWG_REC_TYPE) data_types = data_type_list(pvf_vwg_rec) vwg_df = pd.DataFrame(np.fromregex(input_file, regexp, data_types)) del vwg_df['Padding']
input = np.array(input, dtype=np.int64) score = input.dot(weights_matrix) if np.any(score < 0): return 0 else: return functools.reduce(operator.mul, score) def score_part2(input, weights_matrix): input = np.array(input, dtype=np.int64) score = input.dot(weights_matrix) if np.any(score < 0): return 0 elif score[-1] != 500: return 0 else: return functools.reduce(operator.mul, score[:-1]) def partitions(): for x in range(101): for y in range(101-x): for z in range((101-x)-y): for q in range(((101-x)-y)-z): yield x,y,z,q regex = r'\w+: capacity (-?\d+), durability (-?\d+), flavor (-?\d+), texture (-?\d+), calories (\d+)' weights = np.fromregex('input.txt', regex, np.int64) weights_no_cals = weights[:, :4] print(max(score(split, weights_no_cals) for split in partitions())) print(max(score_part2(split, weights) for split in partitions()))
if np.any(new_amounts < 0): continue new_score = score(new_amounts, specs) if new_score > best_score: best_score = new_score best_amounts = new_amounts return (best_amounts, best_score - cur_score) with open('input', 'r') as f: regex = r'.*: capacity (-?\d+), durability (-?\d+), flavor (-?\d+), texture (-?\d+)' specs = np.fromregex(f, regex, np.int) # Start with a decent guess num_ingredients = len(specs) total_capacity = 100 start_amount = total_capacity / num_ingredients amounts = np.ones((1, num_ingredients), np.int) * start_amount # Correct for the slack amounts[0, -1] = total_capacity - (start_amount * (num_ingredients - 1)) initial_score = score(amounts, specs) # Find the best dimensional improvement (new_amounts, new_score) = best_improvement(amounts, specs) while new_score > 0:
def loadFaceFeatures(path, filename):
    """Load pairs of integers from the text file ``path + filename + '.txt'``.

    Every line of the form ``<anything>=<int> <int>`` that is terminated by a
    newline contributes one row; other lines are ignored.

    The file is parsed with ``re`` directly instead of ``np.fromregex``:
    ``np.fromregex`` is documented to require a structured dtype, so the
    plain ``int`` dtype used previously is rejected by recent NumPy releases.

    Parameters
    ----------
    path :
        Prefix that is concatenated verbatim in front of ``filename`` (it
        must already end with a path separator if it names a directory).
    filename :
        File name without the ``.txt`` extension.

    Returns
    -------
    Integer NumPy array of shape ``(n, 2)``, one row per matching line;
    ``(0, 2)`` when nothing matches.
    """
    import re  # local import: the file's import block is not visible here

    with open(path + filename + '.txt', 'rb') as feature_file:
        content = feature_file.read()

    rows = [
        (int(first), int(second))
        for first, second in re.findall(br'.*=(\d+) (\d+)\n', content)
    ]
    # reshape keeps the result two-dimensional even when no line matched.
    return np.array(rows, dtype=int).reshape(-1, 2)
-0.100 2.141 5.081\n -0.100 3.282 5.262\n ''') parser.add_argument('filename',help='300.txt') args = parser.parse_args() print("~ Filename: {}".format(args.filename)) # fig = plt.figure(num=None, figsize=(12, 6), dpi=120, facecolor='w', edgecolor='k') # ax = fig.gca(projection='3d') # # x = np.fromregex(c, r"(\d+)\s+...", dt) dt = np.dtype([ ('X', np.float), ('Y', np.float), ('Z', np.float)]) data = np.fromregex(args.filename,regexp,dt) # data = np.genfromtxt(args.filename) x = data['X'] y = data['Y'] z = data['Z'] # print data print x print y print z for xv in x: for yv in y: for zv in z: print " ", xv + " " + yv + " " + zv
properties[properties < 0] = 0 # We multiply the properties we have for the cookie (tuple style) return np.prod(properties[:-1]) # Splits the number of spoons for each possible ingredient def split(sums, n): if sums == 1: yield (n, ) else: # being xrange the number of spoons we can have for a in range(n+1): # Calling the function in the function... WOOOOAH! for rest in split(sums-1, n-a): # Yield is the same as return, but it returns a Generator # (it only calculates it once) yield ((a, ) + rest) regex = r'\w+: capacity (-?\d+), durability (-?\d+), flavor (-?\d+), texture (-?\d+), calories (\d+)' # We extract using the regex all the values from the int ingredients = np.fromregex('input15', regex, np.int) # max_score = -1e32 # for spoons in split(len(ingredients), 100): # max_score = max(score(spoons, ingredients), max_score) # print 'The best cookie will have a value of {} units.'.format(max_score) # We calculate the scores given the number of spoons for each ingredient (split spoons for ingredient) scores = [score(spoons, ingredients) for spoons in split(len(ingredients), 100)] print 'The best cookie will have a value of {} units.'.format(max(scores))
M, T, intercept = model MMinv = la.inv( ## implement (X'X)^{-1} (X'Y) np.dot( M.T, M ) ) coef = np.dot( MMinv, np.dot( M.T, T ) ) ## Estimate the residual standard deviation resid = T - np.dot(M, coef) dof = len( T ) - len( coef ) RSS = np.dot( resid.T, resid ) return (coef, RSS, dof, MMinv ) ##################################################################### #+ 0. Load the data (yes, it is a milestone!) ## Load the word count dataset wordcount = np.fromregex( './data/wordcounts.txt', r"(\d+)\s+(.{,32})", [ ( 'freq', np.int64 ), ( 'word', 'S32' ) ] ) ##################################################################### ##+ 1. Check that Zipf's Law holds ## Pre-sort the frequencies: in ascending order of frequencies wordcount.sort( order = 'freq' ) freqs = wordcount[ 'freq' ] ## PRoduce ranks: from 1 up to |W| ranks = np.arange( 1, len( wordcount ) + 1, dtype = float )[::-1] ## The probability of a word frequency being not less than the ## frequency of a gien word w it exactly the ratio of the w's rank ## to the total number of words. probs = ranks / len( wordcount )
mask_z = cat[:,key['z']] < -10 mask_JK = cat[:,key['J']]-cat[:,key['K']] > JK_Rz[0] mask_Rz = cat[:,key['R']]-cat[:,key['z']] < JK_Rz[1] mask_color = mask_J+mask_K+mask_z+mask_JK+mask_Rz > 0 Nint = numpy.shape(cat)[0] cat = cat[mask_color,:] Nfin = numpy.shape(cat)[0] Ncut = Nint-Nfin print 'obsplan: {0} rows were removed from the catalog with {1} initial rows, leaving {2} rows'.format(Ncut,Nint,Nfin) print 'obsplan: apply the redshift filter' cat = tools.catfilter(redshift_bounds[0],redshift_bounds[1],cat,key['z_b']) ## region file catalog filter print 'obsplan: apply the regions filter' # find all the box regions box = numpy.fromregex(regfile,r"box\(([0-9]*\.?[0-9]+),([0-9]*\.?[0-9]+),([0-9]*\.?[0-9]+)\",([0-9]*\.?[0-9]+)\",([0-9]*\.?[0-9]+)", [('xc',numpy.float),('yc',numpy.float),('width',numpy.float),('height',numpy.float),('angle',numpy.float)]) d2r = numpy.pi/180.0 #loop through the regions creating masks for galaxy inclusion for i in numpy.arange(numpy.shape(box)[0]): #phi is the ccw angle from the +East axis xc = box[i][0] yc = box[i][1] w = box[i][2] h = box[i][3] phi=box[i][4]*d2r #rotate the galaxies into the "primed" (p) region coorditate frame centered #at the center of the region ra_p = (cat[:,key['ra']]-xc)*numpy.cos(yc*d2r)*numpy.cos(-phi)+(cat[:,key['dec']]-yc)*numpy.sin(-phi) dec_p = -(cat[:,key['ra']]-xc)*numpy.cos(yc*d2r)*numpy.sin(-phi)+(cat[:,key['dec']]-yc)*numpy.cos(-phi) #determine the min and max bounds of the region
inputPrefix_noType = inputPrefix_noType + inputPrefix[i] if i < (len(inputPrefix)-2): inputPrefix_noType = inputPrefix_noType + '.' s='_%s.pdf' % (dim) outputName = inputPrefix_noType+s ######################## ## Load input file ##### ######################## regexp = r"([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)" output = np.fromregex(sys.argv[1], regexp, [('x', np.float), ('z', np.float), ('dens', np.float), ('fx', np.float), ('fz', np.float) ]) X = np.reshape(output['x'], (26,26)) Y = np.reshape(output['z'], (26,26)) D = np.reshape(output['dens'], (26,26)) U = np.transpose(np.reshape(output['fx'], (26,26))) V = np.transpose(np.reshape(output['fz'], (26,26))) for i in range(26): for j in range(26): if U[i][j] < threshold: U[i][j] = 0 if V[i][j] < threshold: V[i][j] = 0 speed = np.sqrt(U*U + V*V)
gf_dtype = [('day', 'i2'), ('month', 'S3'), ('year', 'i4'), ('open', 'f4'), ('high', 'f4'), ('low', 'f4'), ('close', 'f4'), ('volume', 'i4')] yf_regex = r"(\d{4})-(\d{2})-(\d{2}),(\d+[.]\d+),(\d+[.]\d+),(\d+[.]\d+),(\d+[.]\d+),(\d+),(\d+[.]\d+)" yf_dtype = [('year', 'i4'), ('month', 'i2'), ('day', 'i2'), ('open', 'f4'), ('high', 'f4'), ('low', 'f4'), ('close', 'f4'), ('volume', 'i4'), ('adj_close', 'f4')] def load_quote(quote, overwrite=False): filename = "%s.csv" % quote if overwrite is True or \ (overwrite is False and not os.path.exists(filename)): try: yh_download(quote) except urllib2.HTTPError, e: print "Error with %s" % quote return None return fromregex(filename, yf_regex, yf_dtype) def old_method(): names = ['yhoo', 'goog', 'ibm', 'ge', 'msft', 'aapl', 'ebay', 'dell', 'csco', 'siri'] quotes = dict() for name in names: arr = load_quote(name) quotes[name] = arr return quotes def load_eurostoxx50(overwrite=False): """ Loads the EuroStoxx 50 index components and returns a dictionnary """ stocks = ["ABI.BR", "ACA.PA", "AGN.AS", "AI.PA", "ALO.PA", "ALV.DE",