def fqframe(fileh):
    '''
    Build the pieces needed to stream a fastq file handle as tabular records.

    Parameters:
        fileh: an open fastq file handle (consumed lazily via get_fastq).

    Returns a dict with:
        'obj_func':    zero-argument callable yielding the next raw record
        'columns':     column names, aligned one-to-one with 'getters'
        'getters':     per-column extraction functions for a record
        'validator':   Schema that validates a fully-extracted row dict
        'dictgetters': unused placeholder (always None)
    '''
    final_schema = Schema({
        'id': str,
        'seq': str,
        'quality': str,
        'qual_ints': check_np_type('int64'),
        'error': check_np_type('float64'),
        'description': str
    })
    # NOTE: order here drives the order of `getters` below -- keep in sync.
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')
    get_id = attr('id')
    get_seq = compose(str, attr('seq'))
    # phred integers live in the record's per-letter annotations
    get_qual_ints = compose_all(np.array, itemgetter('phred_quality'),
                                attr('_per_letter_annotations'))
    get_description = attr('description')
    # Sanger-encoded quality string straight from Biopython
    get_quality = SeqIO.QualityIO._get_sanger_quality_str
    get_error = compose(error, get_qual_ints)
    getters = [get_id, get_seq, get_quality, get_description,
               get_qual_ints, get_error]
    assert len(getters) == len(columns)
    iterator = get_fastq(fileh)
    get_raw_record = partial(next, iterator)
    return {
        'obj_func': get_raw_record,
        'columns': columns,
        'getters': getters,
        'validator': final_schema,
        'dictgetters': None
    }
def fqframe(fileh):
    '''
    Assemble column metadata for treating a fastq file handle as a record
    stream.

    Parameters:
        fileh: an open fastq file handle; records are pulled lazily.

    Returns a dict:
        'obj_func'    -> callable returning the next raw record,
        'columns'     -> column names (same order as 'getters'),
        'getters'     -> one extraction function per column,
        'validator'   -> Schema for a fully-built row dict,
        'dictgetters' -> unused placeholder (None).
    '''
    final_schema = Schema({
        'id': str,
        'seq': str,
        'quality': str,
        'qual_ints': check_np_type('int64'),
        'error': check_np_type('float64'),
        'description': str
    })
    # Column order is the contract with 'getters' -- keep the two aligned.
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')
    get_id = attr('id')
    get_seq = compose(str, attr('seq'))
    # phred scores are stored in the record's per-letter annotations dict
    get_qual_ints = compose_all(np.array, itemgetter('phred_quality'),
                                attr('_per_letter_annotations'))
    get_description = attr('description')
    get_quality = SeqIO.QualityIO._get_sanger_quality_str
    get_error = compose(error, get_qual_ints)
    getters = [get_id, get_seq, get_quality, get_description,
               get_qual_ints, get_error]
    assert len(getters) == len(columns)
    iterator = get_fastq(fileh)
    get_raw_record = partial(next, iterator)
    return {
        'obj_func': get_raw_record,
        'columns': columns,
        'getters': getters,
        'validator': final_schema,
        'dictgetters': None
    }
def fqframe(fileh):
    '''
    Build row/frame accessors over a fastq file handle.

    Parameters:
        fileh: an open fastq file handle.

    Returns a namedtuple FastqFrame with:
        get_row(record): extract one schema-validated dict from a record
        load_fastq():    load the whole handle into a DataFrame indexed by id
    '''
    final_schema = Schema({
        'id': str,
        'seq': str,
        'quality': str,
        'qual_ints': check_np_type('int64'),
        'error': check_np_type('float64'),
        'description': str
    })
    index = ['id']
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')
    get_id = attr('id')
    get_seq = compose(str, attr('seq'))
    # phred integers come from the record's per-letter annotations
    get_qual_ints = compose_all(np.array, itemgetter('phred_quality'),
                                attr('_per_letter_annotations'))
    get_description = attr('description')
    get_quality = SeqIO.QualityIO._get_sanger_quality_str
    get_error = compose(error, get_qual_ints)
    # Column-aligned getters.  The previous version re-derived these by
    # reflecting over module attributes named "get_<column>", which fails:
    # the getters are locals of this function, not module globals.  It also
    # called an undefined get_funcs() in a leftover debug print.
    getters = [get_id, get_seq, get_quality, get_description,
               get_qual_ints, get_error]
    assert len(getters) == len(columns)

    def get_row(record):
        '''Extract a schema-validated {column: value} dict from one record.'''
        results = apply_each(getters, record)
        final_dict = dict(zip(columns, results))
        final_schema.validate(final_dict)
        return final_dict

    def load_fastq():
        '''Read every record from fileh into a DataFrame indexed by id.'''
        fq = get_fastq(fileh)
        dicts = map(get_row, fq)
        return pd.DataFrame(dicts).set_index(index)

    return namedtuple('FastqFrame', ['get_row', 'load_fastq'])(get_row, load_fastq)
def fqframe(fileh):
    '''
    Construct per-row and whole-file accessors for a fastq handle.

    Parameters:
        fileh: an open fastq file handle.

    Returns a namedtuple FastqFrame(get_row, load_fastq):
        get_row(record) -> schema-validated dict for one record
        load_fastq()    -> pandas DataFrame of every record, indexed by id
    '''
    final_schema = Schema({
        'id': str,
        'seq': str,
        'quality': str,
        'qual_ints': check_np_type('int64'),
        'error': check_np_type('float64'),
        'description': str
    })
    index = ['id']
    columns = ('id', 'seq', 'quality', 'description', 'qual_ints', 'error')
    get_id = attr('id')
    get_seq = compose(str, attr('seq'))
    # phred scores live in the record's per-letter annotations
    get_qual_ints = compose_all(np.array, itemgetter('phred_quality'),
                                attr('_per_letter_annotations'))
    get_description = attr('description')
    get_quality = SeqIO.QualityIO._get_sanger_quality_str
    get_error = compose(error, get_qual_ints)
    # Getters in column order.  The old get_row looked these up on the
    # module object via attr("get_<col>"), but they are function locals, so
    # that reflection could never resolve; it also printed an undefined
    # get_funcs().  Use the closure's getters directly instead.
    getters = [get_id, get_seq, get_quality, get_description,
               get_qual_ints, get_error]
    assert len(getters) == len(columns)

    def get_row(record):
        '''One record -> validated {column: value} dict.'''
        results = apply_each(getters, record)
        final_dict = dict(zip(columns, results))
        final_schema.validate(final_dict)
        return final_dict

    def load_fastq():
        '''All records -> DataFrame indexed by id.'''
        fq = get_fastq(fileh)
        dicts = map(get_row, fq)
        return pd.DataFrame(dicts).set_index(index)

    return namedtuple('FastqFrame', ['get_row', 'load_fastq'])(get_row, load_fastq)
def walk(G, vstd, cycle, start, current=None, call=0):
    '''Randomly extend a trail through unvisited edges of G, recursing until
    the walk returns to `start`.  Returns (visited_edge_set, cycle_tuple).

    vstd  : set of edges already traversed
    cycle : tuple of nodes visited so far
    '''
    #TODO: I think this leaves out the final step of the cycle.
    if start == current:
        return vstd, cycle# + tuple([current])
    #NOTE: checking for boolean of 0 is bad here haha
    #_current = start if current else current
    _current = start if current is None else current
    # unvisited outgoing edges; random.choice raises IndexError at a dead end
    candidates = set(G.edges(_current)) - vstd
    #candidates = filterfalse(vstd.__contains__, G.neighbors(current))
    edge = random.choice(tuple(candidates))
    nn = edge[1]  # target node of the chosen edge
    return walk(G, vstd | set([edge]), cycle + tuple([nn]), start, nn, call+1)

# first element of an iterable satisfying a predicate (Python 2 ifilter)
filterfst = compose(next, ifilter)

def edges_of_path(G, p):
    # 'kmer' edge labels along consecutive node pairs of path p.
    # NOTE(review): X and F look like fn.py-style underscore/partial
    # shortcuts -- confirm against the project's func module.
    return map(X[0]['kmer'], starmap(F(G.get_edge_data), slider(p, 2)))

reconstruct_str = compose_all(''.join, pmap(''.join), edges_of_path)

def e_cycle(G, vstd=set(), cycle=(), call=0):
    '''
    find a Eulerian path in a graph by iteratively expanding a cycle.
    requires a mostly-balanced and connected graph.'''
    # NOTE(review): mutable default vstd=set() is shared across calls, so a
    # second invocation sees the first call's visited edges; callers should
    # pass vstd explicitly until this is fixed.
    if len(vstd) == len(G.edges()):
        return cycle
    def valid(N):
        # node N is a usable restart point iff it still has an unvisited edge
        edges=G.edges(N)
        return not (set(edges) <= vstd)
        #return bool(map(F(filterfalse, vstd.__contains__), edges))
    if not cycle:
        valid_start = random.choice(G.nodes()) # 6
        cycle = tuple([valid_start])
    else:
        valid_start = filterfst(valid, cycle)
    # NOTE(review): this definition appears truncated at this point in the
    # visible source.
import re import pandas as pd from bioframes import to_np_int, sanger_qual_str_to_error from itertools import groupby from func import pmap, psplit, pstrip, compose, compose_all, merge_dicts, fzip, partial2, dictmap, starcompose from operator import itemgetter from functools import partial import operator as op from schema import Schema, Use from itertools import ifilter # Parse options #from pyparsing import Regex parse_array = compose_all(to_np_int, psplit(','), pstrip('[]')) tabsplit = psplit('\t') basic_scheme={ 'QNAME' : str, 'FLAG' : int, 'RNAME' : str, 'POS' : int, 'MAPQ' : int, 'CIGAR' : str, 'RNEXT' : str, 'PNEXT' : int, 'TLEN' : int, #'MRNM' : str, #'MRNM' : '*='.__contains__, #'MPOS' : int, #'ISIZE' : int, 'SEQ' : str,
print( func.__name__) print( args, kwargs) #print formatAllArgs(args, kwargs) return func(*args, **kwargs) return wrap def slider(seq, window, start=0):#, stop=None): '''assert list(slider([0, 1, 2], 2)) == [ [0,1], [1,2] ] assert list(slider('ABCDE', 4)) == [ 'ABCD', 'BCDE' ] assert list(slider('ABCDE', 1)) == list('ABCDE')''' N = len(seq) for idx in xrange(N-window+1): yield seq[idx:idx+window] filterfst = compose(next, ifilter) composition = compose_all(sorted, list, slider) def fromstr(_in): lines = filter(str.strip, _in.split('\n')) k = int(lines[0]) s = ''.join(lines[1:]) return s, k cfromstr = starcompose(composition, fromstr) #assert ["AATCC", "ATCCA", "CAATC", "CCAAC", "TCCAA"] == cfromstr(r_in) #neighobrs = filter(X[:k] == sfx, prefixg) #NOTE: using generators over lists makes a huge difference. def make_ovrlp_graph(kmers): N = len(kmers) ov = len(kmers[0]) - 1 M = np.zeros((N, N))
# return ddist(centers).argmin() # #return min(centers, key=ddist) def makematrices(s): _centers, _data = splitby(_not(isin('------')), ifilter(bool, s)) #centers = map(makenp, islice(_centers, 1, None)) #data = map(makenp, islice(_data, 1, None)) centers = makenp(islice(_centers, 1, None)) data = makenp(islice(_data, 1, None)) return centers, data isin = partial(methodcaller, '__contains__') makearray = compose_all(np.array, pmap(np.array), pmap(float), psplit(' ')) makenp = compose(np.array, pmap(makearray)) def get_in_out(s): raw_in, raw_out = splitby(_not(isin('Output')), ifilter(bool, s)) k = int(next(raw_in).split(' ')[0]) _in = makenp(raw_in) _out =makenp(islice(raw_out, 1, None)) return _in, _out, k lines = open('Lloyd.txt').readlines() input, expected, k = get_in_out(lines) print soft_k_means_cluster(input, k=3) from matplotlib import pyplot
# method that gets node with matching distance def get_match_dst(D, j, dist): assert dist != 0 return (D[j] == dist).argmax() #return D[i, (D[i] == dist)] def get_match_dists(D, j, dist): assert dist != 0 return (D[j] == dist).nonzero() def non_diag(D, j): return range(0, j) + range(j+1, D.shape[0]) nondiag_products = compose_all(list, get_products, non_diag) nondiag_products3 = compose_all(list, partial(get_products, times=3), non_diag) products3 = compose_all(list, partial(get_products, times=3)) def additive_phyloZ(D, n): if n == 2: return str_matrix(D) ll = limb_len(D, n) non_diag = range(0, n) + range(n+1, D.shape[0]) D[non_diag, j] -= ll D[j, non_diag] -= ll # get matching i, n, k D.mask[n] = D.mask[:, n] = True T = additive_phylo(D, n-1) v_candidates_i = get_match_dst(T, i, x) along_path = lambda c: D[k, c] + D[i, c] == x v = filterfst(along_path, v_candidates_i)