예제 #1
0
def parse_cigar(cigar_str):
    """Parse a CIGAR string (e.g. '15S213M23S') into a dict of summed
    operation lengths keyed 'cigar_<op>', plus a 'cigar_score' entry.

    'cigar_score' is the total length of every operation other than the
    alignment matches M and '=' (i.e. clips, indels, skips, mismatches).
    Example: '15S213M23S' -> {'cigar_S': 38, 'cigar_M': 213, 'cigar_score': 38}

    Rewritten with plain stdlib constructs: the original leaned on the
    project helpers pmap/compose/merge_dicts and a lazy-quantifier regex
    trick for what is a simple per-letter tally.
    """
    # Each CIGAR element is a run-length followed by one operation letter.
    op_regex = re.compile(r'([0-9]+)([MIDNSHPX=])')
    totals = {}
    for length, op in op_regex.findall(cigar_str):
        key = "cigar_{0}".format(op)
        totals[key] = totals.get(key, 0) + int(length)
    # Everything that is not an alignment match counts toward the score.
    mismatches = sum(num for k, num in totals.items()
                     if k not in ('cigar_M', 'cigar_='))
    totals['cigar_score'] = mismatches
    return totals
예제 #2
0
def parseM(raw):
    '''Parse & return a whitespace-separated matrix of floats.

    Blank lines in `raw` are skipped; each remaining line becomes one
    matrix row.  Rewritten with comprehensions: the original relied on
    Python 2's eager `map` plus the project helper `pmap`, which breaks
    under Python 3 (np.matrix over a map object).
    '''
    rows = [line for line in raw.split('\n') if line]
    return np.matrix([[float(tok) for tok in row.split()] for row in rows])
예제 #3
0
def walk(G, vstd, cycle, start, current=None, call=0):
    """Randomly extend a walk over unvisited edges of G until it returns
    to `start`; return the pair (visited-edge set, cycle node tuple).

    G        -- graph; assumed networkx-style (supports G.edges(node)) -- TODO confirm
    vstd     -- set of edges already visited
    cycle    -- tuple of nodes visited so far
    start    -- node at which the walk terminates
    current  -- node the walk is at now (None means start fresh at `start`)
    call     -- recursion depth counter, apparently for debugging only

    NOTE(review): random.choice raises IndexError when no unvisited edge
    leaves the current node, so this assumes the walk can always continue.
    """
    #TODO: I think this leaves out the final step of the cycle.
    if start == current: return vstd, cycle# + tuple([current])
    #NOTE: checking for boolean of 0 is bad here haha
    #_current = start if current else current
    _current = start if current is None else current
    # unvisited edges leaving the current node
    candidates = set(G.edges(_current)) - vstd
    #candidates = filterfalse(vstd.__contains__, G.neighbors(current))
    edge = random.choice(tuple(candidates))
    nn = edge[1]  # follow the chosen edge to its target node
    return walk(G,  vstd | set([edge]), cycle + tuple([nn]), start, nn, call+1)

# First element of an iterable satisfying a predicate (py2 itertools.ifilter).
filterfst = compose(next, ifilter)
def edges_of_path(G, p):
    # Map each consecutive node pair of path p to the 'kmer' attribute of
    # the edge between them.  X / F / slider are project helpers (fn.py
    # style): presumably X[0]['kmer'] builds an itemgetter-like accessor
    # and slider(p, 2) yields overlapping pairs -- TODO confirm.
    return map(X[0]['kmer'], starmap(F(G.get_edge_data), slider(p, 2)))
# Rebuild the string spelled by a path: join the edge kmers end to end.
reconstruct_str = compose_all(''.join, pmap(''.join), edges_of_path)

def e_cycle(G, vstd=set(), cycle=(), call=0):
    ''' find a Eulerian path in a graph by iteratively expanding a cycle.
    requires a mostly-balanced and connected graph.

    NOTE(review): this definition is truncated in this snippet -- the code
    after computing valid_start is missing.
    NOTE(review): the mutable default vstd=set() is shared across calls;
    prefer vstd=None and a fresh set inside the function.'''
    # done once every edge of G has been visited
    if len(vstd) == len(G.edges()): return cycle

    def valid(N):
        # a node can start an expansion while it still has an edge
        # outside the visited set
        edges=G.edges(N)
        return not (set(edges) <= vstd)
        #return bool(map(F(filterfalse, vstd.__contains__), edges))
    if not cycle:
        valid_start = random.choice(G.nodes()) # 6
        cycle = tuple([valid_start])
    else:
        valid_start = filterfst(valid, cycle)
예제 #4
0
파일: dist.py 프로젝트: averagehat/biolearn
#    return ddist(centers).argmin()
#    #return min(centers, key=ddist)


def makematrices(s):
    """Split the lines of `s` into (centers, data) numpy arrays.

    `s` is an iterable of lines containing a '------' separator line:
    everything before it is the centers section, everything after is the
    data section.  The first line of each section is a header and is
    dropped via islice(..., 1, None).  splitby/_not/isin/makenp are
    project helpers -- presumably splitby partitions the iterable at the
    first line failing the predicate; TODO confirm.
    """
    _centers, _data = splitby(_not(isin('------')), ifilter(bool, s))
    #centers = map(makenp, islice(_centers, 1, None))
    #data = map(makenp, islice(_data, 1, None))
    centers = makenp(islice(_centers, 1, None))
    data = makenp(islice(_data, 1, None))
    return centers, data



# isin(sub) -> predicate: isin(sub)(x) is True when `sub in x`.
isin = partial(methodcaller, '__contains__')
# Parse one space-separated line of numbers into a numpy array
# (compose_all presumably applies right-to-left: split, to float, to array
# -- TODO confirm helper semantics).
makearray = compose_all(np.array, pmap(np.array), pmap(float), psplit(' '))
# Parse an iterable of such lines into a 2-D numpy array.
makenp = compose(np.array, pmap(makearray))
def get_in_out(s):
    """Parse a Lloyd-style fixture: lines before the 'Output' marker form
    the input section, lines after it the expected-output section.

    Returns (input_matrix, expected_matrix, k) where k is the first
    integer on the input section's header line.
    """
    raw_in, raw_out = splitby(_not(isin('Output')), ifilter(bool, s))
    # header looks like "<k> ..."; consume it and keep k
    k = int(next(raw_in).split(' ')[0])
    _in = makenp(raw_in)
    _out =makenp(islice(raw_out, 1, None))  # drop the 'Output' marker line
    return _in, _out, k


# Script entry: read the Lloyd clustering fixture and run soft k-means.
# NOTE(review): file handle is never closed; `expected` and the parsed k
# are unused -- k is hard-coded to 3 below.
lines = open('Lloyd.txt').readlines()
input, expected, k = get_in_out(lines)  # NOTE: `input` shadows the builtin
print soft_k_means_cluster(input, k=3)  # Python 2 print statement


from matplotlib import pyplot
예제 #5
0

# eval_flag(bit, flag) -> True when `bit` is set in the SAM flag (bitwise AND).
eval_flag = compose(bool, op.and_)

def flag_dict(flag):
    """Map each SAM-flag bit meaning to whether that bit is set in `flag`."""
    flags = {}
    for bit, meaning in flag_meanings.items():
        flags[meaning] = eval_flag(bit, flag)
    return flags
def split_list(A, idx):
    """Split sequence A at position idx, returning the (head, tail) pair."""
    head = A[:idx]
    tail = A[idx:]
    return head, tail

# The eleven mandatory SAM columns in spec order; optional fields follow them.
sam_columns = ("QNAME", "FLAG", "RNAME", "POS", "MAPQ", "CIGAR", "RNEXT", "PNEXT", "TLEN", "SEQ", "QUAL") # options


#TODO: get_record function takes a filehandle and returns a single record via SeqIO, etc.
#So functions expect a dictionary I guess
#pass
# Pipelines turning one tab-split SAM line into a validated dict.
# parse_option / tabsplit / fzip / partial2 / starcompose are project helpers.
parse_options = compose(dict, pmap(parse_option)) #, tabsplit)
#readfields = compose(tabsplit, next)
# Zip the mandatory field values against their column names into a dict.
line_to_dict = compose_all(dict, partial(zip, sam_columns)) #, tabsplit)
validated_dict = compose(basic_schema.validate, line_to_dict)
# Split a tab-split row into (mandatory fields, optional fields).
fields_and_options = compose(partial2(split_list, len(sam_columns)), tabsplit)
# Apply the field parser and the options parser pairwise -- presumably
# fzip zips the function list against the argument pair; TODO confirm.
parsers = partial(fzip, [validated_dict, parse_options])
parse_fields_and_options = compose(parsers, fields_and_options)
all_but_cigar_dict = starcompose(merge_dicts, parse_fields_and_options)
# Per-record accessors over the parsed dict.
get_cigar_dict = compose(parse_cigar, itemgetter('CIGAR'))
get_flag_dict = compose(flag_dict, itemgetter('FLAG'))
get_error = compose(sanger_qual_str_to_error, itemgetter('QUAL'))

def load_sam(fh):
    """Read a SAM filehandle into a pandas DataFrame, one row per record.

    Blank lines are dropped; get_row turns each remaining line into a dict.
    """
    nonblank = ifilter(bool, fh.read().split('\n'))
    records = map(get_row, nonblank)
    return pd.DataFrame(records)
#TODO: do we really need indices? It complicates querying; it looks like `where` may play better with them.
예제 #6
0
'''
pcompose = partial(partial, compose)
error_from_ints = pcompose(error)
#sanger_qual_str_to_error = cmperror(qual_to_phreds)

'''
# Biopython readers pre-bound to a sequence file format.
get_fastq = partial(SeqIO.parse, format='fastq')
get_fasta = partial(SeqIO.parse, format='fasta')
to_np_int = partial(np.array, dtype=int)
# Count of G/C characters in a sequence (ilen/pifilter are project helpers).
gccontent = compose(ilen, pifilter('GC'.__contains__))

# Sanger encoding stores quality as chr(qual + 33); undo the 33 offset.
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)

''' Error = 10^-(Phred/10) '''
# Quality string -> numpy int array of Phred scores.
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
# Phred -> error probability: 10 ** (phred / -10.0); `div` is the Python 2
# operator.div, and partial2 presumably binds the second argument -- TODO confirm.
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)




#SANGER_OFFSET = 33

'''
assert len(quality) == len(error) == len(phred_scores)
'''

예제 #7
0
'''
'''
# pcompose(f) -> partial(compose, f): prefix f onto a later composition chain.
pcompose = partial(partial, compose)
# NOTE(review): `error` is not defined above this line in the snippet; the
# fragment appears to be out of order relative to its definition below.
error_from_ints = pcompose(error)
#sanger_qual_str_to_error = cmperror(qual_to_phreds)

'''
get_fastq = partial(SeqIO.parse, format='fastq')
get_fasta = partial(SeqIO.parse, format='fasta')
to_np_int = partial(np.array, dtype=int)
gccontent = compose(ilen, pifilter('GC'.__contains__))

minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
''' Error = 10^-(Phred/10) '''
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)

#SANGER_OFFSET = 33
'''
assert len(quality) == len(error) == len(phred_scores)
'''

#validate = scheme.validate
#TODO: could make these validations match samtools spec
#TODO: Could treat options/cigar string as their own class with their own parsing and validation.

예제 #8
0
def parsematrix(raw):
    """Parse a whitespace-separated integer matrix from `raw` into an
    unmasked numpy masked array.

    Blank lines are skipped; each remaining line becomes one row.
    Rewritten with comprehensions: the original relied on Python 2's eager
    `map` plus the project helper `pmap`, which breaks under Python 3
    (np.ma.array over a map object).
    """
    rows = [line for line in raw.split('\n') if line]
    data = [[int(tok) for tok in row.split()] for row in rows]
    return np.ma.array(data, mask=False)
예제 #9
0
from functools import partial
import operator as op
from operator import add, div
from schema import Schema, Use
from itertools import ifilter
# Parse options
#from pyparsing import Regex

to_np_int = partial(np.array, dtype=int)
# Parse "[1,2,3]"-style text into a numpy int array: strip brackets,
# split on commas, convert (psplit/pstrip are project helpers).
parse_array = compose_all(to_np_int, psplit(','), pstrip('[]'))
tabsplit = psplit('\t')


# Sanger encoding stores quality as chr(qual + 33); undo the 33 offset.
minus33 = partial(add, -33)
qual_int_sanger = compose(minus33, ord)
# Quality string -> numpy int array of Phred scores.
qual_to_phreds = compose(to_np_int, pmap(qual_int_sanger))
# Phred -> error probability: 10 ** (phred / -10.0); `div` is the Python 2
# operator.div, and partial2 presumably binds the second argument -- TODO confirm.
error = compose(partial(pow, 10), partial2(div, -10.0))
#don't need to map because numpy vectorizes it automatically
#TODO: handle non-sanger version
sanger_qual_str_to_error = compose(error, qual_to_phreds)

basic_scheme={
    'QNAME' : str,
    'FLAG' : int,
    'RNAME' : str,
    'POS' : int,
    'MAPQ' : int,
    'CIGAR' : str,
    'RNEXT' : str,
    'PNEXT' : int,
    'TLEN' : int,
예제 #10
0
파일: sam.py 프로젝트: demis001/biopandas
# SAM optional-field type codes mapped to value converters.  This is a bare
# dict expression in the snippet; in sam.py it is presumably bound to a
# name -- TODO confirm.
{
    'A' : chr,
    'i' : int,
    'f' : float,
    'Z' : str,
    'H' : int, # hex  -- NOTE(review): plain int() rejects hex digits; int(x, 16)?
    'B' : parse_array  # '[...]'-style numeric array (see parse_array)
}

#parse cigar string
# Module-scope scratch version of parse_cigar, run on a sample CIGAR:
# tally summed run-lengths per operation letter.
cigar_regex = r'(?:([0-9]+)([MIDNSHPX=]))+?'
reg = re.compile(cigar_regex)
tups = reg.findall('15S213M23S')  # [('15', 'S'), ('213', 'M'), ('23', 'S')]
key,value = itemgetter(1), itemgetter(0)
# groupby requires its input sorted by the same key
groups = groupby(sorted(tups, key=key), key)
get_counts = pmap(compose(int, itemgetter(0)))
sum_counts = compose(sum, get_counts)
cigar_dict = dict( (name, sum_counts(nums)) for name, nums in groups)
# total length of all non-match ops; `key not in 'M='` is a substring test,
# which works here because each key is a single op letter
mismatches = sum(num for key, num in cigar_dict.items() if key not in 'M=')

#dictmap(compose(sum, get_counts), dict(groups))
#sum(starmap(to_cigar, tups))

#dict(map(reverse, tups))
''' assert sum(itemgetter('M', 'I', 'S', '=', 'X')) == len(seq) == len(quality), \
    "cigar string M/I/S/=/X should sum to the length of the query sequence." '''

#TODO: parse flag
#TODO: handle empty cases (unmapped reads, *)

# Columns intended as the DataFrame index -- NOTE(review): 'REF' does not
# appear in sam_columns (which uses 'RNAME'); confirm the intended name.
index = ['QNAME', 'POS', 'REF']