def getAllPartiesFromContributions(self, table):
    self.display(table, 'initial table')
    recipientsTable = table['recipient_id']
    self.display(recipientsTable, 'recipientsTable')
    uniqueRecipientsTable = np.unique(recipientsTable)
    self.display(uniqueRecipientsTable, 'uniqueRecipientsTable')
    uniqueCandidateRecipientsTable = np.array(
        list(filter(lambda recipient: recipient.startswith('N'),
                    uniqueRecipientsTable)),
        dtype=[('recipient_id', 'O')])
    self.display(uniqueCandidateRecipientsTable, 'uniqueCandidateRecipientsTable')
    uniqueCommitteeRecipientsTable = np.array(
        list(filter(lambda recipient: recipient.startswith('C'),
                    uniqueRecipientsTable)),
        dtype=[('recipient_id', 'O')])
    self.display(uniqueCommitteeRecipientsTable, 'uniqueCommitteeRecipientsTable')
    candidatesTable = self.tables['candidates'].copy()
    self.display(candidatesTable, 'candidatesTable')
    candidatesTableColumns = [column for column in self.sourceData['candidates'][0]]
    candidatesTableColumns[0] = 'recipient_id'
    candidatesTable.dtype.names = tuple(candidatesTableColumns)
    committeesTable = self.tables['committees'].copy()
    self.display(committeesTable, 'committeesTable')
    committeesTableColumns = [column for column in self.sourceData['committees'][0]]
    committeesTableColumns[0] = 'recipient_id'
    committeesTable.dtype.names = tuple(committeesTableColumns)
    candidateRecipientsTable = rfn.join_by(
        'recipient_id', candidatesTable, uniqueCandidateRecipientsTable,
        jointype='inner', usemask=False)
    self.display(candidateRecipientsTable, 'candidateRecipientsTable')
    committeeRecipientsTable = rfn.join_by(
        'recipient_id', committeesTable, uniqueCommitteeRecipientsTable,
        jointype='inner', usemask=False)
    self.display(committeeRecipientsTable, 'committeeRecipientsTable')
    candidatePartiesTable = np.unique(candidateRecipientsTable['party'])
    self.display(candidatePartiesTable, 'candidatePartiesTable')
    committeePartiesTable = np.unique(committeeRecipientsTable['party'])
    self.display(committeePartiesTable, 'committeePartiesTable')
    return candidatePartiesTable, committeePartiesTable
def _get_data(self, ip):
    # Read summary
    tshark = ('tshark -r %s '
              '-q -z icmpv6,srt,'
              'ipv6.addr==%s'
              '|tail -n5|head -n1') % (self.filename, ip)
    p = self.env.run_host(tshark)
    summary = p.stdout.readline().decode().split()

    # Get longest window
    tshark = ('tshark -r %s '
              '-T fields -e frame.time_relative -e icmpv6.echo.sequence_number '
              '"ipv6.addr==%s&&icmpv6.type==%i" '
              '-E separator=,')  # % (filename, ip, icmpv6.type)
    p = self.env.run_host(tshark % (self.filename, ip, 128))
    reqs = p.stdout.readlines()
    data = [numpy.fromstring(line.decode().strip(), dtype=float, sep=',')
            for line in reqs]
    reqs = numpy.vstack(data) if data else numpy.array([[]])
    reqs.dtype = [('time_req', float), ('id', float)]

    p = self.env.run_host(tshark % (self.filename, ip, 129))
    reps = p.stdout.readlines()
    data = [numpy.fromstring(line.decode().strip(), dtype=float, sep=',')
            for line in reps]
    reps = numpy.vstack(data) if data else numpy.array([[]])
    reps.dtype = [('time_rep', float), ('id', float)]

    max_offline = '<NA>'
    if reqs.size > 0:
        res = rf.join_by('id', reps, reqs, jointype='outer')
        # Find largest "True"
        max_offline = 0
        current_offline = 0
        last_sent = 0
        i = 0
        while i < res.size:
            # Offline window:
            while i < res.size and res.mask['time_rep'][i]:
                i += 1
            if i < res.size:
                current_offline = res.data['time_rep'][i] - last_sent
            else:
                current_offline = res.data['time_req'][i - 1] - last_sent
            # Online window:
            while i < res.size and not res.mask['time_rep'][i]:
                last_sent = res.data['time_req'][i]
                i += 1
            if current_offline > max_offline:
                max_offline = current_offline
            current_offline += 1

    # Format data:
    data = dict()
    headers = ['Filename', 'Requests', 'Replies', 'Lost', 'Max_offline']
    values = [self.filename, summary[0], summary[1], summary[2], max_offline]
    return dict(zip(headers, values))
def test_different_field_order(self):
    # gh-8940
    a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'f4'), ('c', 'u1')])
    b = np.ones(3, dtype=[('c', 'u1'), ('b', 'f4'), ('a', 'i4')])
    # this should not give a FutureWarning:
    j = join_by(['c', 'b'], a, b, jointype='inner', usemask=False)
    assert_equal(j.dtype.names, ['b', 'c', 'a1', 'a2'])
def filter_effects(self):
    """
    Merge effects and data, and flip effect alleles
    """
    effect_positions = self.effects[["CHR", "POS"]]
    data_positions = self.data.snp[["CHR", "POS"]]

    effect_include = np.in1d(effect_positions, data_positions)
    data_include = np.in1d(data_positions, effect_positions)

    self.data.filter_snps(data_include)
    self.effects = self.effects[effect_include]

    # Just give up and convert to float. I have no idea why int doesn't work here,
    # but it's something to do with the fact that you can't have None as a numpy int
    # whereas float gets converted to nan.
    tmp_data = nprec.append_fields(self.data.snp, "GENO", None,
                                   dtypes=[(float, self.data.geno.shape[1])],
                                   usemask=False)
    tmp_data["GENO"] = self.data.geno

    self.effects = nprec.join_by(["CHR", "POS"], self.effects, tmp_data,
                                 usemask=False, jointype="inner")

    flipped = 0
    removed = 0
    for rec in self.effects:
        if rec["EFFECT"] == rec["REF"] and rec["OTHER"] == rec["ALT"]:
            pass
        elif rec["OTHER"] == rec["REF"] and rec["EFFECT"] == rec["ALT"]:
            flipped += 1
            rec["OTHER"] = rec["ALT"]
            rec["EFFECT"] = rec["REF"]
            rec["BETA"] = -rec["BETA"]
        else:
            removed += 1
            rec["EFFECT"] = rec["OTHER"] = "N"

    self.effects = self.effects[self.effects["EFFECT"] != "N"]
    print("Removed " + str(removed) + " non-matching alleles", file=sys.stderr)
    print("Flipped " + str(flipped) + " alleles", file=sys.stderr)
def test_two_keys_two_vars(self): a = np.array(list( zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(50, 60), np.arange(10, 20))), dtype=[('k', int), ('a', int), ('b', int), ('c', int)]) b = np.array(list( zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(65, 75), np.arange(0, 10))), dtype=[('k', int), ('a', int), ('b', int), ('c', int)]) control = np.array([(10, 0, 50, 65, 10, 0), (11, 0, 51, 66, 11, 1), (10, 1, 52, 67, 12, 2), (11, 1, 53, 68, 13, 3), (10, 2, 54, 69, 14, 4), (11, 2, 55, 70, 15, 5), (10, 3, 56, 71, 16, 6), (11, 3, 57, 72, 17, 7), (10, 4, 58, 73, 18, 8), (11, 4, 59, 74, 19, 9)], dtype=[('k', int), ('a', int), ('b1', int), ('b2', int), ('c1', int), ('c2', int)]) test = join_by(['a', 'k'], a, b, r1postfix='1', r2postfix='2', jointype='inner') assert_equal(test.dtype, control.dtype) assert_equal(test, control)
def GetTileDefs(args, strtype='|S12'): #t = esutil.io.read(args.tiles)[args.tilecol][0:2] t = esutil.io.read(args.tiles)[args.tilecol] tindex = np.arange(len(t)) tiles = np.empty(len(t), dtype=[('tilename',strtype), ('index', np.int64)]) tiles['tilename'] = t.astype(strtype) tiles['index'] = np.arange(len(t)) if args.density is not None: for tile in tiles['tilename']: outdir = os.path.join(args.outdir, tile) if not os.path.exists(outdir): os.makedirs(outdir) cur = desdb.connect() q = "select urall, uraur, udecll, udecur, tilename from coaddtile order by udecll desc, urall asc" arr = cur.quick(q, array=True) dt = arr.dtype.descr dt[-1] = ('tilename',strtype) dt = np.dtype(dt) newarr = np.empty(len(arr), dtype=dt) for i in range(len(arr.dtype.names)): name = arr.dtype.names[i] if i == 4: newarr[name] = arr[name].astype(strtype) else: newarr[name] = arr[name] tiles = rec.join_by('tilename', newarr, tiles, usemask=False) tiles = np.sort(tiles, order='index') return tiles
def test_two_keys_two_vars(self): a = np.array( list(zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(50, 60), np.arange(10, 20))), dtype=[("k", int), ("a", int), ("b", int), ("c", int)], ) b = np.array( list(zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(65, 75), np.arange(0, 10))), dtype=[("k", int), ("a", int), ("b", int), ("c", int)], ) control = np.array( [ (10, 0, 50, 65, 10, 0), (11, 0, 51, 66, 11, 1), (10, 1, 52, 67, 12, 2), (11, 1, 53, 68, 13, 3), (10, 2, 54, 69, 14, 4), (11, 2, 55, 70, 15, 5), (10, 3, 56, 71, 16, 6), (11, 3, 57, 72, 17, 7), (10, 4, 58, 73, 18, 8), (11, 4, 59, 74, 19, 9), ], dtype=[("k", int), ("a", int), ("b1", int), ("b2", int), ("c1", int), ("c2", int)], ) test = join_by(["a", "k"], a, b, r1postfix="1", r2postfix="2", jointype="inner") assert_equal(test.dtype, control.dtype) assert_equal(test, control)
def test_outer_join(self): a, b = self.a, self.b test = join_by(('a', 'b'), a, b, 'outer') control = ma.array([(0, 50, 100, -1), (1, 51, 101, -1), (2, 52, 102, -1), (3, 53, 103, -1), (4, 54, 104, -1), (5, 55, 105, -1), (5, 65, -1, 100), (6, 56, 106, -1), (6, 66, -1, 101), (7, 57, 107, -1), (7, 67, -1, 102), (8, 58, 108, -1), (8, 68, -1, 103), (9, 59, 109, -1), (9, 69, -1, 104), (10, 70, -1, 105), (11, 71, -1, 106), (12, 72, -1, 107), (13, 73, -1, 108), (14, 74, -1, 109)], mask=[(0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 1, 0), (0, 0, 0, 1), (0, 0, 1, 0), (0, 0, 0, 1), (0, 0, 1, 0), (0, 0, 0, 1), (0, 0, 1, 0), (0, 0, 0, 1), (0, 0, 1, 0), (0, 0, 1, 0), (0, 0, 1, 0), (0, 0, 1, 0), (0, 0, 1, 0), (0, 0, 1, 0)], dtype=[('a', int), ('b', int), ('c', int), ('d', int)]) assert_equal(test, control)
def GetTileDefs(args, strtype='|S12'): #t = esutil.io.read(args.tiles)[args.tilecol][0:2] t = esutil.io.read(args.tiles)[args.tilecol] tindex = np.arange(len(t)) tiles = np.empty(len(t), dtype=[('tilename', strtype), ('index', np.int64)]) tiles['tilename'] = t.astype(strtype) tiles['index'] = np.arange(len(t)) if args.density is not None: for tile in tiles['tilename']: outdir = os.path.join(args.outdir, tile) if not os.path.exists(outdir): os.makedirs(outdir) cur = desdb.connect() q = "select urall, uraur, udecll, udecur, tilename from coaddtile order by udecll desc, urall asc" arr = cur.quick(q, array=True) dt = arr.dtype.descr dt[-1] = ('tilename', strtype) dt = np.dtype(dt) newarr = np.empty(len(arr), dtype=dt) for i in range(len(arr.dtype.names)): name = arr.dtype.names[i] if i == 4: newarr[name] = arr[name].astype(strtype) else: newarr[name] = arr[name] tiles = rec.join_by('tilename', newarr, tiles, usemask=False) tiles = np.sort(tiles, order='index') return tiles
def test_leftouter_join(self): a, b = self.a, self.b test = join_by(("a", "b"), a, b, "leftouter") control = ma.array( [ (0, 50, 100, -1), (1, 51, 101, -1), (2, 52, 102, -1), (3, 53, 103, -1), (4, 54, 104, -1), (5, 55, 105, -1), (6, 56, 106, -1), (7, 57, 107, -1), (8, 58, 108, -1), (9, 59, 109, -1), ], mask=[ (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), ], dtype=[("a", int), ("b", int), ("c", int), ("d", int)], ) assert_equal(test, control)
def get_joined_array(self, arr1, arr2):
    # https://stackoverflow.com/questions/23500754/numpy-how-to-outer-join-arrays
    if (len(arr1) > 0):
        box1 = arr1
        if len(arr2) > 0:
            box2 = arr2
        else:
            box2 = [box1[0]]
    else:
        if len(arr2) > 0:
            box2 = arr2
            box1 = [box2[0]]
        else:
            return None

    a3 = np.array(box1, dtype=[('col1', np.int8)])
    a2 = np.array(box2, dtype=[('col1', np.int8)])
    a1 = a3[0]
    result = a1
    for a in (a2, a3):
        cols = list(set(result.dtype.names).intersection(a.dtype.names))
        result = recfunctions.join_by(cols, result, a, jointype='outer')

    pr_fr_l = []
    for item in result:
        pr_fr_l.append(item[0])
    print(pr_fr_l)
    return pr_fr_l
def test_different_field_order(self): # gh-8940 a = np.zeros(3, dtype=[("a", "i4"), ("b", "f4"), ("c", "u1")]) b = np.ones(3, dtype=[("c", "u1"), ("b", "f4"), ("a", "i4")]) # this should not give a FutureWarning: j = join_by(["c", "b"], a, b, jointype="inner", usemask=False) assert_equal(j.dtype.names, ["b", "c", "a1", "a2"])
def merge_cort(data, cortisol_filename):
    cort_data = np.genfromtxt(cortisol_filename, dtype=None,
                              names=True, delimiter='\t')
    names = list(cort_data.dtype.names)

    # Find all the columns in cort_data that have 'av' in their title
    # and not '_mask'
    drop_names = names[8:]
    cort_data = nprf.drop_fields(cort_data, drop_names,
                                 usemask=False, asrecarray=True)

    data = nprf.join_by('SubID', data, cort_data, jointype='leftouter',
                        r1postfix='KW', r2postfix='KW2',
                        usemask=False, asrecarray=True)

    # Bizarrely, the join_by function pads with the biggest numbers it can think of!
    # So we're going to replace everything over 999 with 999
    for name in names[1:8]:
        data[name][data[name] > 999] = 999

    # Define a UsableCort field: 1 if ANY of the cortisol values are not 999
    cort_array = np.vstack([data[name] for name in names[1:8]])
    usable_cort_array = np.ones(cort_array.shape[1])
    usable_cort_array[np.any(cort_array != 999, axis=0)] = 1

    data = nprf.append_fields(base=data, names='UsableCort',
                              data=usable_cort_array, usemask=False)

    return data
def test_no_r2postfix(self): # Basic test of join_by no_r2postfix a, b = self.a, self.b test = join_by("a", a, b, r1postfix="1", r2postfix="", jointype="inner") control = np.array( [ (0, 50, 65, 100, 100), (1, 51, 66, 101, 101), (2, 52, 67, 102, 102), (3, 53, 68, 103, 103), (4, 54, 69, 104, 104), (5, 55, 70, 105, 105), (6, 56, 71, 106, 106), (7, 57, 72, 107, 107), (8, 58, 73, 108, 108), (9, 59, 74, 109, 109), ], dtype=[("a", int), ("b1", int), ("b", int), ("c", int), ("d", int)], ) assert_equal(test, control)
def test_join_subdtype(self):
    # tests the bug in https://stackoverflow.com/q/44769632/102441
    foo = np.array([(1, )],
                   dtype=[('key', int)])
    bar = np.array([(1, np.array([1, 2, 3]))],
                   dtype=[('key', int), ('value', 'uint16', 3)])
    res = join_by('key', foo, bar)
    assert_equal(res, bar.view(ma.MaskedArray))
def test_join(self):
    a, b = self.a, self.b

    # Fixme, this test is broken
    #test = join_by(('a', 'b'), a, b)
    #control = np.array([(5, 55, 105, 100), (6, 56, 106, 101),
    #                    (7, 57, 107, 102), (8, 58, 108, 103),
    #                    (9, 59, 109, 104)],
    #                   dtype=[('a', int), ('b', int),
    #                          ('c', int), ('d', int)])
    #assert_equal(test, control)

    # Hack to avoid pyflakes unused variable warnings
    join_by(('a', 'b'), a, b)
    np.array([(5, 55, 105, 100), (6, 56, 106, 101),
              (7, 57, 107, 102), (8, 58, 108, 103),
              (9, 59, 109, 104)],
             dtype=[('a', int), ('b', int),
                    ('c', int), ('d', int)])
def test_join_subdtype(self): # tests the bug in https://stackoverflow.com/q/44769632/102441 from numpy.lib import recfunctions as rfn foo = np.array([(1,)], dtype=[('key', int)]) bar = np.array([(1, np.array([1,2,3]))], dtype=[('key', int), ('value', 'uint16', 3)]) res = join_by('key', foo, bar) assert_equal(res, bar.view(ma.MaskedArray))
def test_join_subdtype(self): # tests the bug in https://stackoverflow.com/q/44769632/102441 from numpy.lib import recfunctions as rfn foo = np.array([(1, )], dtype=[("key", int)]) bar = np.array([(1, np.array([1, 2, 3]))], dtype=[("key", int), ("value", "uint16", 3)]) res = join_by("key", foo, bar) assert_equal(res, bar.view(ma.MaskedArray))
def test_join(self): a, b = self.a, self.b # Fixme, this test is broken # test = join_by(('a', 'b'), a, b) # control = np.array([(5, 55, 105, 100), (6, 56, 106, 101), # (7, 57, 107, 102), (8, 58, 108, 103), # (9, 59, 109, 104)], # dtype=[('a', int), ('b', int), # ('c', int), ('d', int)]) # assert_equal(test, control) # Hack to avoid pyflakes unused variable warnings join_by(("a", "b"), a, b) np.array( [(5, 55, 105, 100), (6, 56, 106, 101), (7, 57, 107, 102), (8, 58, 108, 103), (9, 59, 109, 104)], dtype=[("a", int), ("b", int), ("c", int), ("d", int)], )
def test_inner_join(self):
    # Basic test of join_by
    a, b = self.a, self.b

    test = join_by('a', a, b, jointype='inner')
    control = np.array([(5, 55, 65, 105, 100), (6, 56, 66, 106, 101),
                        (7, 57, 67, 107, 102), (8, 58, 68, 108, 103),
                        (9, 59, 69, 109, 104)],
                       dtype=[('a', int), ('b1', int), ('b2', int),
                              ('c', int), ('d', int)])
    assert_equal(test, control)
def test_same_name_different_dtypes_key(self):
    a_dtype = np.dtype([('key', 'S5'), ('value', '<f4')])
    b_dtype = np.dtype([('key', 'S10'), ('value', '<f4')])
    expected_dtype = np.dtype([
        ('key', 'S10'), ('value1', '<f4'), ('value2', '<f4')])

    a = np.array([('Sarah', 8.0), ('John', 6.0)], dtype=a_dtype)
    b = np.array([('Sarah', 10.0), ('John', 7.0)], dtype=b_dtype)
    res = join_by('key', a, b)

    assert_equal(res.dtype, expected_dtype)
def test_same_name_different_dtypes_key(self): a_dtype = np.dtype([("key", "S5"), ("value", "<f4")]) b_dtype = np.dtype([("key", "S10"), ("value", "<f4")]) expected_dtype = np.dtype([("key", "S10"), ("value1", "<f4"), ("value2", "<f4")]) a = np.array([("Sarah", 8.0), ("John", 6.0)], dtype=a_dtype) b = np.array([("Sarah", 10.0), ("John", 7.0)], dtype=b_dtype) res = join_by("key", a, b) assert_equal(res.dtype, expected_dtype)
def test_two_keys_two_vars(self): a = np.array( list( zip( np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(50, 60), np.arange(10, 20), )), dtype=[("k", int), ("a", int), ("b", int), ("c", int)], ) b = np.array( list( zip( np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(65, 75), np.arange(0, 10), )), dtype=[("k", int), ("a", int), ("b", int), ("c", int)], ) control = np.array( [ (10, 0, 50, 65, 10, 0), (11, 0, 51, 66, 11, 1), (10, 1, 52, 67, 12, 2), (11, 1, 53, 68, 13, 3), (10, 2, 54, 69, 14, 4), (11, 2, 55, 70, 15, 5), (10, 3, 56, 71, 16, 6), (11, 3, 57, 72, 17, 7), (10, 4, 58, 73, 18, 8), (11, 4, 59, 74, 19, 9), ], dtype=[ ("k", int), ("a", int), ("b1", int), ("b2", int), ("c1", int), ("c2", int), ], ) test = join_by(["a", "k"], a, b, r1postfix="1", r2postfix="2", jointype="inner") assert_equal(test.dtype, control.dtype) assert_equal(test, control)
def test_subarray_key(self):
    a_dtype = np.dtype([('pos', int, 3), ('f', '<f4')])
    a = np.array([([1, 1, 1], np.pi), ([1, 2, 3], 0.0)], dtype=a_dtype)

    b_dtype = np.dtype([('pos', int, 3), ('g', '<f4')])
    b = np.array([([1, 1, 1], 3), ([3, 2, 1], 0.0)], dtype=b_dtype)

    expected_dtype = np.dtype([('pos', int, 3), ('f', '<f4'), ('g', '<f4')])
    expected = np.array([([1, 1, 1], np.pi, 3)], dtype=expected_dtype)

    res = join_by('pos', a, b)
    assert_equal(res.dtype, expected_dtype)
    assert_equal(res, expected)
def test_padded_dtype(self):
    dt = np.dtype('i1,f4', align=True)
    dt.names = ('k', 'v')
    assert_(len(dt.descr), 3)  # padding field is inserted

    a = np.array([(1, 3), (3, 2)], dt)
    b = np.array([(1, 1), (2, 2)], dt)

    res = join_by('k', a, b)

    # no padding fields remain
    expected_dtype = np.dtype([
        ('k', 'i1'), ('v1', 'f4'), ('v2', 'f4')])
    assert_equal(res.dtype, expected_dtype)
def augment_effects(self):
    """
    Add the population frequency information to the effects.
    """
    tmp_snp = nprec.append_fields(self.data.snp, "FREQ", None,
                                  dtypes=[(float, self.data.freq.shape[1])],
                                  usemask=False)
    tmp_snp["FREQ"] = self.data.freq
    new_effects = nprec.join_by(["CHR", "POS"], self.effects, tmp_snp,
                                usemask=False, jointype="inner")
    self.effects = new_effects
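# A minimal, self-contained sketch of the join used above, with toy stand-in
# arrays (field names and values are hypothetical). The per-population
# frequencies live in a sub-array field, which join_by carries through on an
# inner join over the (CHR, POS) key pair; here the sub-array field is built
# directly rather than via append_fields.
import numpy as np
from numpy.lib import recfunctions as nprec

snp = np.zeros(2, dtype=[('CHR', int), ('POS', int), ('FREQ', float, 2)])
snp['CHR'] = [1, 1]
snp['POS'] = [100, 200]
snp['FREQ'] = [[0.1, 0.2], [0.3, 0.4]]

effects = np.array([(1, 200, 0.5)],
                   dtype=[('CHR', int), ('POS', int), ('BETA', float)])

# Only the SNP present in both arrays (CHR 1, POS 200) survives the inner
# join, and its FREQ sub-array is carried into the merged record.
merged = nprec.join_by(['CHR', 'POS'], effects, snp,
                       usemask=False, jointype='inner')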
def test_padded_dtype(self): dt = np.dtype("i1,f4", align=True) dt.names = ("k", "v") assert_(len(dt.descr), 3) # padding field is inserted a = np.array([(1, 3), (3, 2)], dt) b = np.array([(1, 1), (2, 2)], dt) res = join_by("k", a, b) # no padding fields remain expected_dtype = np.dtype([("k", "i1"), ("v1", "f4"), ("v2", "f4")]) assert_equal(res.dtype, expected_dtype)
def test_subarray_key(self): a_dtype = np.dtype([("pos", int, 3), ("f", "<f4")]) a = np.array([([1, 1, 1], np.pi), ([1, 2, 3], 0.0)], dtype=a_dtype) b_dtype = np.dtype([("pos", int, 3), ("g", "<f4")]) b = np.array([([1, 1, 1], 3), ([3, 2, 1], 0.0)], dtype=b_dtype) expected_dtype = np.dtype([("pos", int, 3), ("f", "<f4"), ("g", "<f4")]) expected = np.array([([1, 1, 1], np.pi, 3)], dtype=expected_dtype) res = join_by("pos", a, b) assert_equal(res.dtype, expected_dtype) assert_equal(res, expected)
def test_no_r2postfix(self):
    # Basic test of join_by no_r2postfix
    a, b = self.a, self.b

    test = join_by(
        'a', a, b, r1postfix='1', r2postfix='', jointype='inner')
    control = np.array([(0, 50, 65, 100, 100), (1, 51, 66, 101, 101),
                        (2, 52, 67, 102, 102), (3, 53, 68, 103, 103),
                        (4, 54, 69, 104, 104), (5, 55, 70, 105, 105),
                        (6, 56, 71, 106, 106), (7, 57, 72, 107, 107),
                        (8, 58, 73, 108, 108), (9, 59, 74, 109, 109)],
                       dtype=[('a', int), ('b1', int), ('b', int),
                              ('c', int), ('d', int)])
    assert_equal(test, control)
def test_leftouter_join(self): a, b = self.a, self.b test = join_by(('a', 'b'), a, b, 'leftouter') control = ma.array( [(0, 50, 100, -1), (1, 51, 101, -1), (2, 52, 102, -1), (3, 53, 103, -1), (4, 54, 104, -1), (5, 55, 105, -1), (6, 56, 106, -1), (7, 57, 107, -1), (8, 58, 108, -1), (9, 59, 109, -1)], mask=[(0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1)], dtype=[('a', int), ('b', int), ('c', int), ('d', int)]) assert_equal(test, control)
def join_by(self, r2, key, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, asrecarray=False, asTable=True):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of strings corresponding
    to the fields used to join the array. An exception is raised if the `key`
    field cannot be found in the two input arrays. Neither `r1` nor `r2`
    should have any duplicates along `key`: the presence of duplicates will
    make the output quite unreliable. Note that duplicates are not looked for
    by the algorithm.

    INPUTS:
        key        {str, seq}  A string or a sequence of strings
                   corresponding to the fields used for comparison.
        r2         [Table]     Table to join with

    KEYWORDS:
        jointype   [str]  {'inner', 'outer', 'leftouter'}
                   'inner': returns the elements common to both r1 and r2.
                   'outer': returns the common elements as well as the
                   elements of r1 not in r2 and the elements of r2 not in r1.
                   'leftouter': returns the common elements and the elements
                   of r1 not in r2.
        r1postfix  [str]   String appended to the names of the fields of r1
                   that are present in r2
        r2postfix  [str]   String appended to the names of the fields of r2
                   that are present in r1
        defaults   [dict]  Dictionary mapping field names to the
                   corresponding default values.
        asrecarray [bool]  Whether to return a recarray or just a
                   flexible-type ndarray.
        asTable    [bool]  Whether to return a Table (default).

    *Notes*:
        - The output is sorted along the key.
        - A temporary array is formed by dropping the fields not in the key
          for the two arrays and concatenating the result. This array is then
          sorted, and the common entries selected. The output is constructed
          by filling the fields with the selected entries. Matching is not
          preserved if there are some duplicates...
    """
    #TODO: return a Table by default
    if asTable:
        asrecarray = True
    arr = recfunctions.join_by(key, self, r2, jointype=jointype,
                               r1postfix=r1postfix, r2postfix=r2postfix,
                               defaults=defaults, usemask=False,
                               asrecarray=asrecarray)

    return arr
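# A small usage sketch of the underlying numpy.lib.recfunctions.join_by call
# (plain structured arrays rather than the Table wrapper above; the field
# names are made up for illustration).
import numpy as np
from numpy.lib import recfunctions

left = np.array([(1, 10.0), (2, 20.0), (3, 30.0)],
                dtype=[('key', int), ('x', float)])
right = np.array([(2, 200.0), (3, 300.0), (4, 400.0)],
                 dtype=[('key', int), ('y', float)])

# 'inner' keeps only the keys present in both arrays (2 and 3 here).
inner = recfunctions.join_by('key', left, right,
                             jointype='inner', usemask=False)

# 'leftouter' keeps every row of `left`; the missing 'y' values come back
# masked (or filled, once usemask=False or defaults are given).
louter = recfunctions.join_by('key', left, right, jointype='leftouter')

# 'outer' keeps the union of the keys; the result is sorted along 'key'.
outer = recfunctions.join_by('key', left, right, jointype='outer')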
def merge(left_array: np.ndarray,
          right_array: np.ndarray,
          left_on: str,
          right_on: str,
          how: "{inner, outer, leftouter}" = "inner",
          left_postscript="_left",
          right_postscript="_right") -> np.ndarray:
    """
    Multiple methods of merging data on unique columns.

    This method is not optimised and makes use of NumPy's recfunctions.
    This method achieves everything that can be done with Pandas' merge
    function.

    :param left_array: np.ndarray, the left concatenating array.
    :param right_array: np.ndarray, the right concatenating array.
    :param left_on: str, the left unique column to merge on.
    :param right_on: str, the right unique column to merge on.
    :param how: {inner, outer, leftouter} str,
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of r2 not in r1.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    :param left_postscript: str, appended to the names of the fields of
        left_array that are present in right_array but absent of the key.
    :param right_postscript: str, appended to the names of the fields of
        right_array that are present in left_array but absent of the key.
    :return: np.ndarray, newly merged array.
    """
    # DATA
    if how not in ["inner", "outer", "leftouter"]:
        raise ValueError(
            "how has to be set to either: 'inner','outer','leftouter'")
    if left_on != right_on:
        if left_on in right_array.dtype.names:
            right_array = drop(right_array, left_on)
        mapping = {right_on: left_on}

        # LOGIC
        right_array.dtype.names = [
            mapping.get(word, word) for word in right_array.dtype.names
        ]
    return rfn.join_by(left_on,
                       left_array,
                       right_array,
                       jointype=how,
                       usemask=False,
                       r1postfix=left_postscript,
                       r2postfix=right_postscript)
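# A hypothetical usage sketch of the merge() wrapper above (it assumes the
# drop() helper referenced inside merge() is importable alongside it; the
# field names below are invented).
import numpy as np

left = np.array([(1, 0.5), (2, 1.5)],
                dtype=[('id', int), ('score', float)])
right = np.array([(1, 7.0), (3, 9.0)],
                 dtype=[('id', int), ('weight', float)])

# Inner join on the shared 'id' column: only id == 1 appears in the result.
both = merge(left, right, left_on='id', right_on='id', how='inner')

# Left outer join keeps id == 2 as well; its missing 'weight' is padded with
# the dtype's default fill value because merge() passes usemask=False.
all_left = merge(left, right, left_on='id', right_on='id', how='leftouter')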
def test_padded_dtype(self): dt = np.dtype('i1,f4', align=True) dt.names = ('k', 'v') assert_(len(dt.descr), 3) # padding field is inserted a = np.array([(1, 3), (3, 2)], dt) b = np.array([(1, 1), (2, 2)], dt) res = join_by('k', a, b) # no padding fields remain expected_dtype = np.dtype([ ('k', 'i1'), ('v1', 'f4'), ('v2', 'f4') ]) assert_equal(res.dtype, expected_dtype)
def build_utc_array(source, sink, start, end):
    source_prices = retrieve_node_data(source, start, end)
    sink_prices = retrieve_node_data(sink, start, end)

    source_data = []
    for element in source_prices:
        source_data.append((element[0].replace(tzinfo=pytz.timezone('EST')),
                            element[1], element[2], element[5]))
    sink_data = []
    for element in sink_prices:
        sink_data.append((element[0].replace(tzinfo=pytz.timezone('EST')),
                          element[1], element[2], element[5]))

    sink_dt = numpy.dtype([('time_id', 'S32'), ('sink_node_id', 'i8'),
                           ('sink_rt_lmp', 'f8'), ('sink_da_lmp', 'f8')])
    source_dt = numpy.dtype([('time_id', 'S32'), ('source_node_id', 'i8'),
                             ('source_rt_lmp', 'f8'), ('source_da_lmp', 'f8')])
    sink_array = numpy.array(sink_data, dtype=sink_dt)
    source_array = numpy.array(source_data, dtype=source_dt)

    joined = rfn.join_by('time_id', sink_array, source_array,
                         jointype='inner', usemask=False)

    rt_congestion_rounded = numpy.round(
        joined['sink_rt_lmp'] - joined['source_rt_lmp'], 2)
    da_congestion_rounded = numpy.round(
        joined['sink_da_lmp'] - joined['source_da_lmp'], 2)
    profit_rounded = numpy.round(
        rt_congestion_rounded - da_congestion_rounded, 2)

    joined = rfn.append_fields(joined, 'rt_congestion', data=rt_congestion_rounded)
    joined = rfn.append_fields(joined, 'da_congestion', data=da_congestion_rounded)
    joined = rfn.append_fields(joined, 'profit', data=profit_rounded)

    return joined[['time_id', 'rt_congestion']]
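# A small, self-contained illustration of the join/append pattern above, with
# made-up prices: join two structured arrays on 'time_id', then attach a
# computed column with append_fields.
import numpy as np
from numpy.lib import recfunctions as rfn

sink = np.array([(b'2020-01-01 00:00', 30.0), (b'2020-01-01 01:00', 28.0)],
                dtype=[('time_id', 'S32'), ('sink_rt_lmp', 'f8')])
source = np.array([(b'2020-01-01 00:00', 25.0), (b'2020-01-01 01:00', 29.5)],
                  dtype=[('time_id', 'S32'), ('source_rt_lmp', 'f8')])

joined = rfn.join_by('time_id', sink, source, jointype='inner', usemask=False)
spread = np.round(joined['sink_rt_lmp'] - joined['source_rt_lmp'], 2)
joined = rfn.append_fields(joined, 'rt_congestion', data=spread, usemask=False)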
def test_inner_join(self): # Basic test of join_by a, b = self.a, self.b test = join_by("a", a, b, jointype="inner") control = np.array( [ (5, 55, 65, 105, 100), (6, 56, 66, 106, 101), (7, 57, 67, 107, 102), (8, 58, 68, 108, 103), (9, 59, 69, 109, 104), ], dtype=[("a", int), ("b1", int), ("b2", int), ("c", int), ("d", int)], ) assert_equal(test, control)
def test_leftouter_join(self): a, b = self.a, self.b test = join_by(('a', 'b'), a, b, 'leftouter') control = ma.array([(0, 50, 100, -1), (1, 51, 101, -1), (2, 52, 102, -1), (3, 53, 103, -1), (4, 54, 104, -1), (5, 55, 105, -1), (6, 56, 106, -1), (7, 57, 107, -1), (8, 58, 108, -1), (9, 59, 109, -1)], mask=[(0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1), (0, 0, 0, 1)], dtype=[('a', int), ('b', int), ('c', int), ('d', int)]) assert_equal(test, control)
def CalcDistArray(inval):
    '''
    Calculate Distance Array using the Generate Near Function.

    Called By:
        CalcDistancesLayer
        CalcDistanceLayerMultiple

    Calls:

    Arguments:
        inval = [UPConfig, layername]

    Returns:
        Distarray: [OBJECTID, distance, BaseGeom_id, attracter]
    '''
    UPConfig = inval[0]
    layername = inval[1]
    gn_table = arcpy.GenerateNearTable_analysis(
        os.path.join(UPConfig['paths']['dbpath'], UPConfig['paths']['dbname'],
                     UPConfig['BaseGeom_cent']),
        os.path.join(UPConfig['paths']['dbpath'], UPConfig['paths']['dbname'],
                     layername),
        'in_memory/temp_up_dist', "", "", "", "CLOSEST")

    # Convert gn_table to a Numpy Array
    gn_array = arcpy.da.TableToNumPyArray(gn_table, ['IN_FID', 'NEAR_DIST'])
    desc = arcpy.Describe(
        os.path.join(UPConfig['paths']['dbpath'], UPConfig['paths']['dbname'],
                     UPConfig['BaseGeom_cent']))
    oidfieldname = desc.OIDFieldName
    gn_array.dtype.names = str(oidfieldname), 'distance'
    bg_array = arcpy.da.TableToNumPyArray(
        os.path.join(UPConfig['paths']['dbpath'], UPConfig['paths']['dbname'],
                     UPConfig['BaseGeom_cent']),
        [oidfieldname, UPConfig['BaseGeom_id']])

    arr = rfn.join_by(oidfieldname, gn_array, bg_array, 'outer')
    arr = AddNumpyField(arr, [('attracter', '<a50')])
    for ln in arr:
        ln['attracter'] = layername

    arcpy.Delete_management('in_memory/temp_up_dist')
    return (arr)
def MergeArrays(arrlist, joinfldname):
    '''
    Merge a list of arrays into a single array based on a common field and return

    Called by:

    Arguments:
        arrlist: a list of numpy arrays
        joinfldname: the name of the field to use for the join
    '''
    oarr = arrlist[0]
    for res in arrlist[1:]:
        #Logger("Merging: dist_{ts}_{att}".format(att=res[0],ts = res[1][0]))
        arr = res
        oarr = rfn.join_by(str(joinfldname), oarr, arr, 'outer')
    return (oarr)
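# A hypothetical usage sketch of MergeArrays: fold several per-layer distance
# arrays into one table keyed on a shared id column (field names invented).
# Each successive outer join keeps the union of the ids seen so far, with
# missing values masked.
import numpy as np

schools = np.array([(1, 0.5), (2, 1.5)], dtype=[('pid', int), ('d_school', float)])
parks = np.array([(2, 3.0), (3, 4.0)], dtype=[('pid', int), ('d_park', float)])
roads = np.array([(1, 9.0), (3, 8.0)], dtype=[('pid', int), ('d_road', float)])

merged = MergeArrays([schools, parks, roads], 'pid')  # one row per pid 1, 2, 3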
def merge_cort(data, cortisol_filename):
    cort_data = np.genfromtxt(cortisol_filename, dtype=None,
                              names=True, delimiter='\t')
    names = list(cort_data.dtype.names)

    # Find all the columns in cort_data that have 'av' in their title
    # and not '_mask'
    drop_names = names[8:]
    cort_data = nprf.drop_fields(cort_data, drop_names,
                                 usemask=False, asrecarray=True)

    data = nprf.join_by('SubID', data, cort_data, jointype='leftouter',
                        r1postfix='KW', r2postfix='KW2',
                        usemask=False, asrecarray=True)

    # Bizarrely, the join_by function pads with the biggest numbers it can think of!
    # So we're going to replace everything over 999 with 999
    for name in names[1:8]:
        data[name][data[name] > 999] = 999

    # Define a UsableCort field: 1 if ANY of the cortisol values are not 999
    cort_array = np.vstack([data[name] for name in names[1:8]])
    usable_cort_array = np.ones(cort_array.shape[1])
    usable_cort_array[np.any(cort_array != 999, axis=0)] = 1

    data = nprf.append_fields(base=data, names='UsableCort',
                              data=usable_cort_array, usemask=False)

    return data
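# The huge pad values complained about above are join_by's default fill values
# for unmatched rows. A minimal sketch (with hypothetical field names) of the
# `defaults` argument, which lets you pick the sentinel instead:
import numpy as np
from numpy.lib import recfunctions as nprf

left = np.array([(1, 0.1), (2, 0.2), (3, 0.3)],
                dtype=[('SubID', int), ('score', float)])
right = np.array([(1, 5.0)], dtype=[('SubID', int), ('cort', float)])

# Rows of `left` with no match in `right` get cort == 999.0 rather than the
# dtype's enormous default fill value.
joined = nprf.join_by('SubID', left, right, jointype='leftouter',
                      defaults={'cort': 999.0}, usemask=False)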
def test_two_keys_two_vars(self): a = np.array(list(zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(50, 60), np.arange(10, 20))), dtype=[('k', int), ('a', int), ('b', int), ('c', int)]) b = np.array(list(zip(np.tile([10, 11], 5), np.repeat(np.arange(5), 2), np.arange(65, 75), np.arange(0, 10))), dtype=[('k', int), ('a', int), ('b', int), ('c', int)]) control = np.array([(10, 0, 50, 65, 10, 0), (11, 0, 51, 66, 11, 1), (10, 1, 52, 67, 12, 2), (11, 1, 53, 68, 13, 3), (10, 2, 54, 69, 14, 4), (11, 2, 55, 70, 15, 5), (10, 3, 56, 71, 16, 6), (11, 3, 57, 72, 17, 7), (10, 4, 58, 73, 18, 8), (11, 4, 59, 74, 19, 9)], dtype=[('k', int), ('a', int), ('b1', int), ('b2', int), ('c1', int), ('c2', int)]) test = join_by( ['a', 'k'], a, b, r1postfix='1', r2postfix='2', jointype='inner') assert_equal(test.dtype, control.dtype) assert_equal(test, control)
def _desc_dither_columns(data, dithers):
    logging.info('adding dithers')
    d = join_by('observationId', data, dithers, jointype='inner',
                defaults={'descDitheredRA': 0.,
                          'descDitheredDec': 0.,
                          'descDitheredRotTelPos': 0.},
                usemask=False)
    # nm = []
    # for nn in d.dtype.names:
    #     if nn == 'observationId':
    #         nm.append(nn)
    #     else:
    #         nm.append(nn[:-1])
    # d.dtype.names = nm
    d['rotTelPos'] = d['descDitheredRotTelPos']
    return d
def filter_effects_against_data(self):
    """
    Take a dataset and filter out all the effect SNPs that are not in the
    dataset. Also flip the alleles so that the EFFECT allele is the REF allele.
    """
    # First, filter alleles:
    new_effects = nprec.join_by(["CHR", "POS"], self.effects,
                                self.data.snp[["CHR", "POS", "REF", "ALT"]],
                                usemask=False, jointype="inner")
    print("Removed " + str(len(self.effects) - len(new_effects)) +
          " effect SNPS not in data", file=sys.stderr)

    if not len(new_effects):
        raise Exception("No effect SNPs in reference data")

    if any(new_effects[("EFFECT")] == "N") or any(new_effects[("OTHER")] == "N"):
        raise Exception("Effects corrupted. Either .gwas file or frequency file "
                        "is bad (possibly contains duplicates)")

    flipped = 0
    removed = 0
    for rec in new_effects:
        if rec["EFFECT"] == rec["REF"] and rec["OTHER"] == rec["ALT"]:
            pass
        elif rec["OTHER"] == rec["REF"] and rec["EFFECT"] == rec["ALT"]:
            flipped += 1
            rec["OTHER"] = rec["ALT"]
            rec["EFFECT"] = rec["REF"]
            rec["BETA"] = -rec["BETA"]
        else:
            removed += 1
            rec["EFFECT"] = rec["OTHER"] = "N"

    new_effects = new_effects[new_effects["EFFECT"] != "N"]
    print("Removed " + str(removed) + " non-matching alleles", file=sys.stderr)
    print("Flipped " + str(flipped) + " alleles", file=sys.stderr)
    self.effects = new_effects[["CHR", "POS", "EFFECT", "OTHER", "BETA"]]
data_all = np.copy(data)
#================================================================================================
result, indexes = np.unique(ngc_to_messier['ngc'], return_index=True)
ngc_to_messier = ngc_to_messier[indexes]
result, indexes = np.unique(data['ngc'], return_index=True)
data = data[indexes]
data = rfn.join_by('ngc', data, ngc_to_messier, jointype='leftouter', usemask=False)
data = data[(data["type"] == "OC") | (data["type"] == "C+N") |
            (data["ngc"] == PLEIADES_MAGIC_ID) | (data["ngc"] == HYADES_MAGIC_ID) |
            (data["ngc"] == ORION_NEBULA_NGC)]
data["messier"][data["ngc"] == PLEIADES_MAGIC_ID] = "M 45 (pleiades)"
data["messier"][data["ngc"] == HYADES_MAGIC_ID] = "hyades"
data["messier"][data["messier"] == "M 44"] = "M 44 (beehive)"
data["messier"][data["messier"] == "M 42"] = "M 42 (orion nb)"
#================================================================================================
data = np.sort(data, order=['messier'])
#================================================================================================
result, indexes = np.unique(dt['ngc'], return_index=True)
dt = dt[indexes]
result, indexes = np.unique(nebula_distance['ngc'], return_index=True)
nebula_distance = nebula_distance[indexes]
dt = np.sort(dt, order=['ngc'])
nebula_distance = np.sort(nebula_distance, order=['ngc'])
#================================================================================================
dt = rfn.join_by('ngc', dt, nebula_distance, jointype='leftouter',
                 usemask=False, defaults={'dist': 0})
result, indexes = np.unique(dt['messier'], return_index=True)
dt = dt[indexes]
#================================================================================================
fill_with_zeros = np.zeros(dt.size)
dt = rfn.append_fields(dt, ['x', 'y', 'z'],
                       data=[fill_with_zeros, fill_with_zeros, fill_with_zeros],
                       usemask=False)
dt["glong"] = np.radians(dt["glong"])
dt["glat"] = np.radians(dt["glat"])
dt["x"] = dt["dist"] * np.cos(dt["glat"]) * np.cos(dt["glong"])
    ends = np.append(ends, ends[-1] + mod)
else:
    starts = None
    ends = None

starts, ends = suchyta_utils.mpi.Scatter(starts, ends)
for i in range(len(starts)):
    z = fitsio.read(zzfile, ext=-1, rows=np.arange(starts[i], ends[i]))
    df = fitsio.FITS(dfile, 'r')
    print(starts[i], ends[i])
    w = df[1].where('coadd_objects_id >= %i && coadd_objects_id <= %i'
                    % (z['coadd_objects_id'][0], z['coadd_objects_id'][-1]))
    if len(w) == 0:
        continue
    d = df[1][w]
    dz = rec.join_by('coadd_objects_id', d, z, usemask=False)
    dz = rec.append_fields(dz, 'field', [field]*len(dz))
    file = os.path.join(outdir, '%i-%i.fits' % (starts[i], ends[i]))
    if os.path.exists(file):
        os.remove(file)
    f = fitsio.FITS(file, 'rw')
    f.write(dz)

MPI.COMM_WORLD.Barrier()
if MPI.COMM_WORLD.Get_rank() == 0:
    outfile = os.path.join(outdir, '%s.fits' % (name))
    if os.path.exists(outfile):
        os.remove(outfile)
    files = os.listdir(outdir)
    of = fitsio.FITS(outfile, 'rw')
                 dtype=[('hd', 'int'), ('con', 'S20')])
#================================================================================================
result, indexes = np.unique(data['hd'], return_index=True)
data = data[indexes]
result, indexes = np.unique(names['hd'], return_index=True)
names = names[indexes]
result, indexes = np.unique(constellations['hd'], return_index=True)
constellations = constellations[indexes]

data = rfn.join_by('hd', data, names, jointype='leftouter',
                   usemask=False, defaults={'name': '?'})
data = rfn.join_by('hd', data, constellations, jointype='leftouter',
                   usemask=False, defaults={'con': '?'})

fill_with_zeros = np.zeros(data.size)
data = rfn.append_fields(data, ['x', 'y', 'z', 'dist'],
                         [fill_with_zeros, fill_with_zeros,
                          fill_with_zeros, fill_with_zeros],
                         usemask=False)
#================================================================================================
data = data[data["parallax"] != 0]
data["parallax"] = np.absolute(data["parallax"])
import numpy as np
import numpy.lib.recfunctions as rfn
import pandas as pd

# The logic in this file takes the many data files generated by IBCombineHistoricalData
# and "joins" them by datetime to create a file with one date-time column and numerous data columns

if __name__ == "__main__":

    # MANUALLY SET THESE FILES FOR JOINING
    filesToJoin = [
        "C:/Dropbox/CninSrc/JTS/TWS API/samples/Java/Data/CAD_BID.txt",
        "C:/Dropbox/CninSrc/JTS/TWS API/samples/Java/Data/CAD_ASK.txt"
    ]

    # MANUALLY SET THIS FOR OUTPUT FILE
    outFile = "C:/Dropbox/CninSrc/JTS/TWS API/samples/Java/Data/JOINED.txt"

    # MANUALLY SET THIS HEADER SO EASIER TO REMEMBER COLUMNS
    headerTxt = "Time,CAD_BID,CAD_ASK"

    joined = []
    for file in filesToJoin:
        print("Handling: " + file)
        data = np.genfromtxt(file, delimiter=',',
                             dtype=np.dtype([('time', np.int64), ('price', np.float64)]))
        print("Joining")
        if len(joined) == 0:
            joined = data
        else:
            joined = rfn.join_by('time', joined, data, jointype='inner', usemask=False)

    np.savetxt(outFile, joined, delimiter=',', fmt="%s", header=headerTxt, comments="")
def fetch_localizer_contrasts( contrasts, n_subjects=None, get_tmaps=False, get_masks=False, get_anats=False, data_dir=None, url=None, resume=True, verbose=1, ): """Download and load Brainomics Localizer dataset (94 subjects). "The Functional Localizer is a simple and fast acquisition procedure based on a 5-minute functional magnetic resonance imaging (fMRI) sequence that can be run as easily and as systematically as an anatomical scan. This protocol captures the cerebral bases of auditory and visual perception, motor actions, reading, language comprehension and mental calculation at an individual level. Individual functional maps are reliable and quite precise. The procedure is decribed in more detail on the Functional Localizer page." (see http://brainomics.cea.fr/localizer/) "Scientific results obtained using this dataset are described in Pinel et al., 2007" [1] Parameters ---------- contrasts: list of str The contrasts to be fetched (for all 94 subjects available). Allowed values are:: {"checkerboard", "horizontal checkerboard", "vertical checkerboard", "horizontal vs vertical checkerboard", "vertical vs horizontal checkerboard", "sentence listening", "sentence reading", "sentence listening and reading", "sentence reading vs checkerboard", "calculation (auditory cue)", "calculation (visual cue)", "calculation (auditory and visual cue)", "calculation (auditory cue) vs sentence listening", "calculation (visual cue) vs sentence reading", "calculation vs sentences", "calculation (auditory cue) and sentence listening", "calculation (visual cue) and sentence reading", "calculation and sentence listening/reading", "calculation (auditory cue) and sentence listening vs " "calculation (visual cue) and sentence reading", "calculation (visual cue) and sentence reading vs checkerboard", "calculation and sentence listening/reading vs button press", "left button press (auditory cue)", "left button press (visual cue)", "left button press", "left vs right button press", "right button press (auditory cue)", "right button press (visual cue)", "right button press", "right vs left button press", "button press (auditory cue) vs sentence listening", "button press (visual cue) vs sentence reading", "button press vs calculation and sentence listening/reading"} or equivalently on can use the original names:: {"checkerboard", "horizontal checkerboard", "vertical checkerboard", "horizontal vs vertical checkerboard", "vertical vs horizontal checkerboard", "auditory sentences", "visual sentences", "auditory&visual sentences", "visual sentences vs checkerboard", "auditory calculation", "visual calculation", "auditory&visual calculation", "auditory calculation vs auditory sentences", "visual calculation vs sentences", "auditory&visual calculation vs sentences", "auditory processing", "visual processing", "visual processing vs auditory processing", "auditory processing vs visual processing", "visual processing vs checkerboard", "cognitive processing vs motor", "left auditory click", "left visual click", "left auditory&visual click", "left auditory & visual click vs right auditory&visual click", "right auditory click", "right visual click", "right auditory&visual click", "right auditory & visual click vs left auditory&visual click", "auditory click vs auditory sentences", "visual click vs visual sentences", "auditory&visual motor vs cognitive processing"} n_subjects: int, optional The number of subjects to load. If None is given, all 94 subjects are used. get_tmaps: boolean Whether t maps should be fetched or not. 
get_masks: boolean Whether individual masks should be fetched or not. get_anats: boolean Whether individual structural images should be fetched or not. data_dir: string, optional Path of the data directory. Used to force data storage in a specified location. url: string, optional Override download URL. Used for test only (or if you setup a mirror of the data). resume: bool Whether to resume download of a partly-downloaded file. verbose: int Verbosity level (0 means no message). Returns ------- data: Bunch Dictionary-like object, the interest attributes are : - 'cmaps': string list Paths to nifti contrast maps - 'tmaps' string list (if 'get_tmaps' set to True) Paths to nifti t maps - 'masks': string list Paths to nifti files corresponding to the subjects individual masks - 'anats': string Path to nifti files corresponding to the subjects structural images References ---------- Pinel, Philippe, et al. "Fast reproducible identification and large-scale databasing of individual functional cognitive networks." BMC neuroscience 8.1 (2007): 91. """ if isinstance(contrasts, _basestring): raise ValueError("Contrasts should be a list of strings, but " 'a single string was given: "%s"' % contrasts) if n_subjects is None: n_subjects = 94 # 94 subjects available if (n_subjects > 94) or (n_subjects < 1): warnings.warn("Wrong value for 'n_subjects' (%d). The maximum " "value will be used instead ('n_subjects=94')") n_subjects = 94 # 94 subjects available # we allow the user to use alternatives to Brainomics contrast names contrast_name_wrapper = { # Checkerboard "checkerboard": "checkerboard", "horizontal checkerboard": "horizontal checkerboard", "vertical checkerboard": "vertical checkerboard", "horizontal vs vertical checkerboard": "horizontal vs vertical checkerboard", "vertical vs horizontal checkerboard": "vertical vs horizontal checkerboard", # Sentences "sentence listening": "auditory sentences", "sentence reading": "visual sentences", "sentence listening and reading": "auditory&visual sentences", "sentence reading vs checkerboard": "visual sentences vs checkerboard", # Calculation "calculation (auditory cue)": "auditory calculation", "calculation (visual cue)": "visual calculation", "calculation (auditory and visual cue)": "auditory&visual calculation", "calculation (auditory cue) vs sentence listening": "auditory calculation vs auditory sentences", "calculation (visual cue) vs sentence reading": "visual calculation vs sentences", "calculation vs sentences": "auditory&visual calculation vs sentences", # Calculation + Sentences "calculation (auditory cue) and sentence listening": "auditory processing", "calculation (visual cue) and sentence reading": "visual processing", "calculation (visual cue) and sentence reading vs " "calculation (auditory cue) and sentence listening": "visual processing vs auditory processing", "calculation (auditory cue) and sentence listening vs " "calculation (visual cue) and sentence reading": "auditory processing vs visual processing", "calculation (visual cue) and sentence reading vs checkerboard": "visual processing vs checkerboard", "calculation and sentence listening/reading vs button press": "cognitive processing vs motor", # Button press "left button press (auditory cue)": "left auditory click", "left button press (visual cue)": "left visual click", "left button press": "left auditory&visual click", "left vs right button press": "left auditory & visual click vs " + "right auditory&visual click", "right button press (auditory cue)": "right auditory click", "right 
button press (visual cue)": "right visual click", "right button press": "right auditory & visual click", "right vs left button press": "right auditory & visual click " + "vs left auditory&visual click", "button press (auditory cue) vs sentence listening": "auditory click vs auditory sentences", "button press (visual cue) vs sentence reading": "visual click vs visual sentences", "button press vs calculation and sentence listening/reading": "auditory&visual motor vs cognitive processing", } allowed_contrasts = list(contrast_name_wrapper.values()) # convert contrast names contrasts_wrapped = [] # get a unique ID for each contrast. It is used to give a unique name to # each download file and avoid name collisions. contrasts_indices = [] for contrast in contrasts: if contrast in allowed_contrasts: contrasts_wrapped.append(contrast) contrasts_indices.append(allowed_contrasts.index(contrast)) elif contrast in contrast_name_wrapper: name = contrast_name_wrapper[contrast] contrasts_wrapped.append(name) contrasts_indices.append(allowed_contrasts.index(name)) else: raise ValueError("Contrast '%s' is not available" % contrast) # It is better to perform several small requests than a big one because: # - Brainomics server has no cache (can lead to timeout while the archive # is generated on the remote server) # - Local (cached) version of the files can be checked for each contrast opts = {"uncompress": True} subject_ids = ["S%02d" % s for s in range(1, n_subjects + 1)] subject_id_max = subject_ids[-1] data_types = ["c map"] if get_tmaps: data_types.append("t map") rql_types = str.join(", ", ['"%s"' % x for x in data_types]) root_url = "http://brainomics.cea.fr/localizer/" base_query = ( "Any X,XT,XL,XI,XF,XD WHERE X is Scan, X type XT, " "X concerns S, " "X label XL, X identifier XI, " "X format XF, X description XD, " 'S identifier <= "%s", ' % (subject_id_max,) + 'X type IN(%(types)s), X label "%(label)s"' ) urls = [ "%sbrainomics_data_%d.zip?rql=%s&vid=data-zip" % (root_url, i, _urllib.parse.quote(base_query % {"types": rql_types, "label": c}, safe=",()")) for c, i in zip(contrasts_wrapped, contrasts_indices) ] filenames = [] for subject_id in subject_ids: for data_type in data_types: for contrast_id, contrast in enumerate(contrasts_wrapped): name_aux = str.replace(str.join("_", [data_type, contrast]), " ", "_") file_path = os.path.join("brainomics_data", subject_id, "%s.nii.gz" % name_aux) file_tarball_url = urls[contrast_id] filenames.append((file_path, file_tarball_url, opts)) # Fetch masks if asked by user if get_masks: urls.append( "%sbrainomics_data_masks.zip?rql=%s&vid=data-zip" % (root_url, _urllib.parse.quote(base_query % {"types": '"boolean mask"', "label": "mask"}, safe=",()")) ) for subject_id in subject_ids: file_path = os.path.join("brainomics_data", subject_id, "boolean_mask_mask.nii.gz") file_tarball_url = urls[-1] filenames.append((file_path, file_tarball_url, opts)) # Fetch anats if asked by user if get_anats: urls.append( "%sbrainomics_data_anats.zip?rql=%s&vid=data-zip" % (root_url, _urllib.parse.quote(base_query % {"types": '"normalized T1"', "label": "anatomy"}, safe=",()")) ) for subject_id in subject_ids: file_path = os.path.join("brainomics_data", subject_id, "normalized_T1_anat_defaced.nii.gz") file_tarball_url = urls[-1] filenames.append((file_path, file_tarball_url, opts)) # Fetch subject characteristics (separated in two files) if url is None: url_csv = "%sdataset/cubicwebexport.csv?rql=%s&vid=csvexport" % ( root_url, _urllib.parse.quote("Any X WHERE X is Subject"), ) 
url_csv2 = "%sdataset/cubicwebexport2.csv?rql=%s&vid=csvexport" % ( root_url, _urllib.parse.quote( "Any X,XI,XD WHERE X is QuestionnaireRun, " "X identifier XI, X datetime " "XD", safe="," ), ) else: url_csv = "%s/cubicwebexport.csv" % url url_csv2 = "%s/cubicwebexport2.csv" % url filenames += [("cubicwebexport.csv", url_csv, {}), ("cubicwebexport2.csv", url_csv2, {})] # Actual data fetching dataset_name = "brainomics_localizer" data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose) fdescr = _get_dataset_descr(dataset_name) files = _fetch_files(data_dir, filenames, verbose=verbose) anats = None masks = None tmaps = None # combine data from both covariates files into one single recarray from numpy.lib.recfunctions import join_by ext_vars_file2 = files[-1] csv_data2 = np.recfromcsv(ext_vars_file2, delimiter=";") files = files[:-1] ext_vars_file = files[-1] csv_data = np.recfromcsv(ext_vars_file, delimiter=";") files = files[:-1] # join_by sorts the output along the key csv_data = join_by("subject_id", csv_data, csv_data2, usemask=False, asrecarray=True)[:n_subjects] if get_anats: anats = files[-n_subjects:] files = files[:-n_subjects] if get_masks: masks = files[-n_subjects:] files = files[:-n_subjects] if get_tmaps: tmaps = files[1::2] files = files[::2] return Bunch(cmaps=files, tmaps=tmaps, masks=masks, anats=anats, ext_vars=csv_data, description=fdescr)
def gc_txt2fits(fname_base): f = open('%s.txt' % fname_base, 'r') txt = f.read() f.close() col_fmt = [] col_len = [] col_fmt.append([('ID', 'S11'), ('Name', 'S12'), ('RAh', 'i1'), ('RAm', 'i1'), ('RAs', 'f4'), ('DECh', 'i1'), ('DECm', 'i1'), ('DECs', 'f4'), ('l', 'f4'), ('b', 'f4'), ('R_Sun', 'f4'), ('R_gc', 'f4'), ('X', 'f4'), ('Y', 'f4'), ('Z', 'f4')]) col_fmt.append([('ID', 'S11'), ('FeH', 'f4'), ('FeH_wt', 'i2'), ('EBV', 'f4'), ('VHB', 'f4'), ('DM_V', 'f4'), ('V_t', 'f4'), ('M_Vt', 'f4'), ('UB', 'f4'), ('BV', 'f4'), ('VR', 'f4'), ('RI', 'f4'), ('spt', 'S5'), ('ellip', 'f4')]) col_fmt.append([('ID', 'S11'), ('v_r', 'f4'), ('v_r_err', 'f4'), ('v_LSR', 'f4'), ('sig_v', 'f4'), ('sig_v_err', 'f4'), ('c', 'f4'), ('r_c', 'f4'), ('r_h', 'f4'), ('mu_V', 'f4'), ('rho_0', 'f4'), ('log_tc', 'f4'), ('log_th', 'f4')]) ''' comment_unit = {('ID': 'GLOBULAR CLUSTER ID'), ('Name': 'NAME OF GC'), ('RAh': 'RA HOUR'), ('RAm': 'RA MINUTE'), ('RAs': 'RA SECOND'), ('DECh': 'DEC HOUR'), ('DECm': 'DEC MINUTE'), ('DECs': 'DEC SECOND'), ('l': 'GALACTIC LONGITUDE'), ('b': 'GALACTIC LATITUDE'), ('R_Sun': 'DIST FROM SUN'), ('R_gc': 'DIST FROM GALACTIC CENTER'), ('X': 'CARTESIAN X DISP FROM GAL CENTER'), ('Y': 'CARTESIAN Y DISP FROM GAL CENTER'), ('Z': 'CARTESIAN Z DISP FROM GAL CENTER'), ('FeH': 'METALLICITY'), ('FeH_wt': 'WEIGHT OF FEH MEASUREMENT'), ('EBV': 'B-V EXCESS'), ('VHB': ''), ('DMV': 'DIST MODULUS FROM V BAND'), ('V_t': ''), ('M_Vt': ''), ('UB': 'U-B COLOR'), ('BV': 'B-V COLOR'), ('VR': 'V-R COLOR'), ('RI': 'R-I COLOR'), ('spt': 'INTEGRATED SPECTRAL TYPE'), ('ellip': ''), ('v_r': 'HELIOCENTRIC RADIAL VELOCITY'), ('v_r_err': 'UNCERTAINTY IN v_r'), ('v_LSR': 'RAD VEL RELATIVE TO LSR'), ('sig_v': 'CENTRAL VELOCITY DISP'), ('sig_v_err': 'UNCERTAINTY IN sig_v_err'), ('c': 'CONCENTRATION PARAMETER'), ('r_c': 'RADIUS OF CORE'), ('r_h': 'HALF-LIGHT RADIUS'), ('mu_V': 'V-BAND SURFACE BRIGHTNESS'), ('rho_0': 'SURFACE NUMBER DENSITY'), ('log_tc': 'CORE RELAXATION TIME'), ('log_th': 'MEDIAN RELAXATION TIME')} ''' col_len.append([11, 13, 3, 3, 7, 4, 3, 7, 8, 8, 6, 6, 6, 6, 5]) col_len.append([11, 7, 5, 5, 6, 6, 6, 7, 7, 6, 6, 6, 6, 5]) col_len.append([11, 8, 6, 8, 8, 7, 8, 8, 8, 7, 7, 7, 5]) formatted_txt = [] for i,s in enumerate(block_string_by_comments(txt)): rows = [] for line in s.splitlines(): # Ignore comments and blank lines line = line.lstrip() if len(line) == 0: continue elif line[0] == '#': continue # Read in columns of constant width cols = [] start = 0 ncols = 0 for c in col_len[i]: if start + c > len(line): break tmp = line[start:start+c].lstrip().rstrip() if tmp == '': tmp = 'NaN' cols.append(tmp) ncols += 1 start += c # Fill in missing columns at end for k in xrange(ncols, len(col_len[i])): cols.append('NaN') # Join columns, using tabs as delimiters rows.append('\t'.join(cols)) # Join rows, using endlines as delimiters formatted_txt.append('\n'.join(rows)) # Convert formatted strings into numpy record arrays d = [] for fmt,s in zip(col_fmt, formatted_txt): d.append(np.genfromtxt(StringIO(s), dtype=fmt, delimiter='\t')) # Merge record arrays by name out = join_by('ID', d[0], d[1], jointype='outer') out = join_by('ID', out, d[2], jointype='outer') out['Name'][out['Name'] == 'NaN'] = '' out['spt'][out['spt'] == 'NaN'] = '' # Output record array to FITS file ''' cols = [] cols.append(pyfits.Column(name='MU', format='%dD' % len(mu), array=mu)) for i, m in enumerate(maps): cols.append(pyfits.Column(name='A_R %d' % i, format='D', array=m)) tbhdu = pyfits.new_table(cols) tbhdu.header.update('NESTED', nest, 
'Healpix ordering scheme.') tbhdu.header.update('NSIDE', hp.npix2nside(maps.shape[1]), 'Healpix nside parameter.') tbhdu.writeto(fname, clobber=True) ''' #hdu = [] #hdu.append(pyfits.PrimaryHDU(mu)) #for m in maps: # hdu.append(pyfits.ImageHDU(m)) #hdulist = pyfits.HDUList(hdu) #hdulist.writeto(fname, clobber=True) try: pyfits.writeto('%s.fits' % fname_base, out, clobber=False) except IOError, e: print e
def fetch(self, contrasts=None, n_subjects=None, get_tmaps=False, get_masks=False, get_anats=False, url=None, resume=True, force=False, verbose=1): if n_subjects is None: n_subjects = 94 # 94 subjects available if (n_subjects > 94) or (n_subjects < 1): warnings.warn("Wrong value for \'n_subjects\' (%d). The maximum " "value will be used instead (\'n_subjects=94\')") n_subjects = 94 # 94 subjects available if contrasts is None: contrasts = self.contrast_name_wrapper.values() elif isinstance(contrasts, _basestring): contrasts = [contrasts] allowed_contrasts = list(self.contrast_name_wrapper.values()) # convert contrast names contrasts_wrapped = [] # get a unique ID for each contrast. It is used to give a unique name to # each download file and avoid name collisions. contrasts_indices = [] for contrast in contrasts: if contrast in allowed_contrasts: contrasts_wrapped.append(contrast) contrasts_indices.append(allowed_contrasts.index(contrast)) elif contrast in self.contrast_name_wrapper: name = self.contrast_name_wrapper[contrast] contrasts_wrapped.append(name) contrasts_indices.append(allowed_contrasts.index(name)) else: raise ValueError("Contrast \'%s\' is not available" % contrast) # It is better to perform several small requests than a big one because: # - Brainomics server has no cache (can lead to timeout while the archive # is generated on the remote server) # - Local (cached) version of the files can be checked for each contrast opts = {'uncompress': True} subject_ids = ["S%02d" % s for s in range(1, n_subjects + 1)] subject_id_max = subject_ids[-1] data_types = ["c map"] if get_tmaps: data_types.append("t map") rql_types = str.join(", ", ["\"%s\"" % x for x in data_types]) root_url = "http://brainomics.cea.fr/localizer/" base_query = ("Any X,XT,XL,XI,XF,XD WHERE X is Scan, X type XT, " "X concerns S, " "X label XL, X identifier XI, " "X format XF, X description XD, " 'S identifier <= "%s", ' % (subject_id_max, ) + 'X type IN(%(types)s), X label "%(label)s"') urls = ["%sbrainomics_data_%d.zip?rql=%s&vid=data-zip" % (root_url, i, _urllib.parse.quote(base_query % {"types": rql_types, "label": c}, safe=',()')) for c, i in zip(contrasts_wrapped, contrasts_indices)] filenames = [] for subject_id in subject_ids: for data_type in data_types: for contrast_id, contrast in enumerate(contrasts_wrapped): name_aux = str.replace( str.join('_', [data_type, contrast]), ' ', '_') file_path = os.path.join( "brainomics_data", subject_id, "%s.nii.gz" % name_aux) file_tarball_url = urls[contrast_id] filenames.append((file_path, file_tarball_url, opts)) # Fetch masks if asked by user if get_masks: urls.append("%sbrainomics_data_masks.zip?rql=%s&vid=data-zip" % (root_url, _urllib.parse.quote(base_query % {"types": '"boolean mask"', "label": "mask"}, safe=',()'))) for subject_id in subject_ids: file_path = os.path.join( "brainomics_data", subject_id, "boolean_mask_mask.nii.gz") file_tarball_url = urls[-1] filenames.append((file_path, file_tarball_url, opts)) # Fetch anats if asked by user if get_anats: urls.append("%sbrainomics_data_anats.zip?rql=%s&vid=data-zip" % (root_url, _urllib.parse.quote(base_query % {"types": '"normalized T1"', "label": "anatomy"}, safe=',()'))) for subject_id in subject_ids: file_path = os.path.join( "brainomics_data", subject_id, "normalized_T1_anat_defaced.nii.gz") file_tarball_url = urls[-1] filenames.append((file_path, file_tarball_url, opts)) # Fetch subject characteristics (separated in two files) if url is None: url_csv = ("%sdataset/cubicwebexport.csv?rql=%s&vid=csvexport" % 
(root_url, _urllib.parse.quote("Any X WHERE X is Subject"))) url_csv2 = ("%sdataset/cubicwebexport2.csv?rql=%s&vid=csvexport" % (root_url, _urllib.parse.quote("Any X,XI,XD WHERE X is QuestionnaireRun, " "X identifier XI, X datetime " "XD", safe=',') )) else: url_csv = "%s/cubicwebexport.csv" % url url_csv2 = "%s/cubicwebexport2.csv" % url filenames += [("cubicwebexport.csv", url_csv, {}), ("cubicwebexport2.csv", url_csv2, {})] # Actual data fetching files = self.fetcher.fetch(filenames, resume=resume, force=force, verbose=verbose) anats = None masks = None tmaps = None # combine data from both covariates files into one single recarray from numpy.lib.recfunctions import join_by ext_vars_file2 = files[-1] csv_data2 = np.recfromcsv(ext_vars_file2, delimiter=';') files = files[:-1] ext_vars_file = files[-1] csv_data = np.recfromcsv(ext_vars_file, delimiter=';') files = files[:-1] # join_by sorts the output along the key csv_data = join_by('subject_id', csv_data, csv_data2, usemask=False, asrecarray=True)[:n_subjects] if get_anats: anats = files[-n_subjects:] files = files[:-n_subjects] if get_masks: masks = files[-n_subjects:] files = files[:-n_subjects] if get_tmaps: tmaps = files[1::2] files = files[::2] return Bunch(cmaps=files, tmaps=tmaps, masks=masks, anats=anats, ext_vars=csv_data)