Пример #1
0
def generate(X, seqType, args):
    '''
    Build g-gap pair-count features for every sequence and save them.

    Each sequence is first cut to its leading ``args.terminusLength``
    symbols. For every gap ``g`` in ``1 .. args.gGap`` and every ordered
    pair of alphabet symbols, the feature is the number of windows returned
    by ``utils.kmers(x, g + 2)`` whose first symbol equals the first member
    of the pair and whose last symbol equals the second member.

    # Note-1: args.gGap --> 1, 2, 3
    # Note-2: a pair is a 2-tuple of alphabet symbols, e.g. ('A', 'C')

    :param X: iterable of sliceable sequences
    :param seqType: 'DNA', 'RNA' or 'PROT'; passed to utils.sequenceElements
    :param args: object exposing ``gGap`` and ``terminusLength``
    :return: None; the matrix is handed to ``save.datasetSave`` with the
        tag ``'fg11'``
    '''
    # Function-scope import keeps the block self-contained.
    from collections import Counter

    elements = utils.sequenceElements(seqType)
    pairs = list(itertools.product(elements, repeat=2))

    T = []
    for x in X:
        x = x[:args.terminusLength]
        t = []
        for gap in range(1, args.gGap + 1):
            # Tally the (first, last) symbols of each window once, then read
            # the count for every pair, instead of rescanning all windows
            # for each of the len(pairs) pairs.
            ends = Counter((v[0], v[-1]) for v in utils.kmers(x, gap + 2))
            # Counter returns 0 for pairs that never occur.
            t.extend(ends[pair] for pair in pairs)
        T.append(np.array(t))
    T = np.array(T)

    # Feature count reported to the saver; 0 for an unrecognised seqType.
    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = 4 * args.gGap * 4
    elif seqType == 'PROT':
        totalFeature = 20 * args.gGap * 20
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'fg11')


#end-def
Пример #2
0
def log_ZS_analytic(params):
    """Compute log Z_S exactly by summing over every k-mer.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.

    For each k-mer produced by ``kmers(len(matrix))`` the term
    ``(1 / (1 + exp(ep - mu))) ** (Ne - 1)`` is accumulated, where ``ep`` is
    ``score_seq(matrix, kmer)``. Returns the natural log of the total.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu)))**nu
    return log(acc)
Пример #3
0
def log_ZS_analytic(params):
    """Compute log Z_S exactly by summing over every k-mer.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.

    For each k-mer produced by ``kmers(len(matrix))`` the term
    ``(1 / (1 + exp(ep - mu))) ** (Ne - 1)`` is accumulated, where ``ep`` is
    ``score_seq(matrix, kmer)``. Returns the natural log of the total.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu)))**nu
    return log(acc)
Пример #4
0
def log_Zb_chem_pot_ref_dep(L, sigma, G, mu, upto=4):
    """Return ``log(G / 4**L * Z0)`` for a reference-dependent site score.

    Every site from ``kmers(L)`` scores ``sigma * (L - number of "A"s)``.
    For each ``k`` yielded by ``trange(upto)`` the weights
    ``exp(-sum(combination) - mu * k)`` are summed over all ``k``-element
    combinations of the site scores; ``Z0`` is the sum of those per-``k``
    totals.
    """
    site_scores = []
    for site in kmers(L):
        non_reference = L - site.count("A")
        site_scores.append(sigma * non_reference)

    def weight_for(k):
        # Total weight of all ways to pick k of the site scores.
        chosen_sets = itertools.combinations(site_scores, k)
        return sum(exp(-sum(chosen) - mu * k) for chosen in chosen_sets)

    per_k_totals = [weight_for(k) for k in trange(upto)]
    Z0 = sum(per_k_totals)
    prefactor = G / (4**L)
    return log(prefactor * Z0)
Пример #5
0
def generate(X, seqType, args):
    '''
    Build positional one-hot g-gap features for every sequence and save them.

    Each sequence is cut to its leading ``args.terminusLength`` symbols.
    For every gap ``g`` in ``1 .. args.gGap`` each window returned by
    ``utils.kmers(x, 2 + g)`` becomes a one-hot row marking the pair formed
    by the window's first and last symbol. Every per-gap block is padded
    with zero rows up to ``args.terminusLength - 2`` rows, and the blocks
    are concatenated side by side.

    :param X: iterable of sliceable sequences
    :param seqType: 'DNA', 'RNA' or 'PROT'; passed to utils.sequenceElements
    :param args: object exposing ``gGap`` and ``terminusLength``
    :return: None; the array is handed to ``save.datasetSave`` with the
        tag ``'pg11'``
    '''
    elements = utils.sequenceElements(seqType)
    m = list(itertools.product(elements, repeat=2))

    # All-zero label -> 0 mapping, built once and copied per window.
    template = dict.fromkeys((''.join(pair) for pair in m), 0)

    # Padding row sized from the real one-hot width, so it is always
    # defined and always matches the data rows.
    p = [0] * len(template)

    # Row count of the gap-1 block for a full-length sequence; every block
    # is padded to this height so they can be concatenated on axis 1.
    rows = args.terminusLength - (2 + 1) + 1

    T = []
    for x in X:
        x = x[:args.terminusLength]
        merged = []
        for gap in range(1, args.gGap + 1):
            t = []
            for kmer in utils.kmers(x, 2 + gap):
                d = dict(template)
                d[kmer[0] + kmer[-1]] = 1
                t.append(list(d.values()))
            # Pad from the number of rows actually produced; a non-positive
            # difference adds nothing.
            t.extend([p] * (rows - len(t)))
            merged.append(np.array(t))
        T.append(np.concatenate(merged, axis=1))
    T = np.array(T)

    # Feature count reported to the saver; 0 for an unrecognised seqType.
    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = 4 * args.gGap * 4
    elif seqType == 'PROT':
        totalFeature = 20 * args.gGap * 20
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'pg11')


#end-for
def compute_Z(model):
    """Sum ``exp(score(model, seq))`` over every sequence from ``kmers(L)``.

    ``L`` is derived from the number of entries in ``model`` as
    ``int(1 + sqrt(1 + 8 * len(model)) / 2)``.
    """
    n_params = len(model)
    half_root = sqrt(1 + 8 * n_params) / 2
    L = int(1 + half_root)
    weights = (exp(score(model, "".join(letters))) for letters in kmers(L))
    return sum(weights)
Пример #7
0
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu)))**(Ne - 1)
    return log(acc)


def log_ZM_analytic(params, N):
    """Return ``N * log_ZS_analytic((matrix, mu, Ne))``.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.
    """
    matrix, mu, Ne = params
    log_ZS = log_ZS_analytic((matrix, mu, Ne))
    return N * log_ZS


def log_Z_analytic(params, N):
    """Compute log Z exactly: ``N`` times the log of the k-mer sum.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.

    For each k-mer produced by ``kmers(len(matrix))`` the term
    ``(1 / (1 + exp(ep - mu))) ** (Ne - 1)`` is accumulated, where ``ep`` is
    ``score_seq(matrix, kmer)``. Returns ``N * log(total)``.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu)))**nu
    return N * log(acc)


def log_ZS_naive(params, trials=1000):
    """Estimate log Z_S by sampling ``trials`` sites.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.

    Each trial scores ``random_site(L)`` with ``score_seq`` and accumulates
    ``(1 / (1 + exp(ep - mu))) ** (Ne - 1)``. The result is
    ``L * log(4) + log(mean term)``. Raises ZeroDivisionError when
    ``trials`` is 0, as before.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    # range() replaces xrange() so the loop runs on Python 2 and 3 alike.
    for _ in range(trials):
        ep = score_seq(matrix, random_site(L))
        acc += (1 / (1 + exp(ep - mu)))**nu
    mean_Zs = acc / trials
    return L * log(4) + log(mean_Zs)
Пример #8
0
	gen_kmers <k> sample <sample name>

		or

	gen_kmers <k> genomes <opt:full>
"""
# Command-line arguments: the k-mer length first, then the mode string.
k = int(sys.argv[1])
version = sys.argv[2]

if version == 'sample':
    # Sample mode: the third argument names a file under samples/ that is
    # read one line at a time.
    sample_name = sys.argv[3]
    filename = 'samples/%s.txt' % sample_name
    sample_kmers = kmer_store()
    with open(filename) as f:
        for read in f:
            # Pass every k-mer of the whitespace-stripped line to the store.
            read_kmers = kmers(read.strip(), k)
            for kmer in read_kmers:
                sample_kmers.update(kmer)

    # Output path: pickles/<input basename with '.txt' removed>_kmers_<k>.pickle
    output_filename = 'pickles/%s_kmers_%d.pickle' % (os.path.basename(
        os.path.normpath(filename)).replace('.txt', ''), k)
    # NOTE(review): the file is opened in text mode ('w'); pickling on
    # Python 3 needs binary mode - confirm the target interpreter.
    with open(output_filename, 'w') as f:
        cPickle.dump(sample_kmers.kmers, f)

elif version == 'genomes':
    full = True if (len(sys.argv) == 4 and sys.argv[3] == 'full') else False
    kmer_spectra = defaultdict(lambda: [0] * 20)
    for index, genome_filename in enumerate(
            progress(
                filter(lambda x: x.endswith('.fna'), os.listdir('genomes')))):
        kmer_spectrum = {} if full else kmer_store()
Пример #9
0
def generate(X, seqType, args):
    '''
    Build positional one-hot k-mer features for every sequence and save them.

    Each sequence is cut to its leading ``args.terminusLength`` symbols.
    Every window returned by ``utils.kmers(x, args.kTuple)`` becomes a
    one-hot row over all ``args.kTuple``-long words of the alphabet. Each
    sequence's matrix is padded with zero rows up to
    ``args.terminusLength - args.kTuple + 1`` rows.

    :param X: iterable of sliceable sequences
    :param seqType: 'DNA', 'RNA' or 'PROT'; passed to utils.sequenceElements
    :param args: object exposing ``kTuple`` and ``terminusLength``
    :return: None; the array is handed to ``save.datasetSave`` with the
        tag ``'pkmer'``
    '''
    elements = utils.sequenceElements(seqType)
    m = list(itertools.product(elements, repeat=args.kTuple))

    # All-zero word -> 0 mapping, built once and copied per window.
    template = dict.fromkeys((''.join(letters) for letters in m), 0)

    # Padding row sized from the real one-hot width, so it is always
    # defined and always matches the data rows.
    p = [0] * len(template)

    terminusLength = args.terminusLength
    # Number of windows a full-length sequence yields.
    rows = terminusLength - args.kTuple + 1

    T = []
    for x in X:
        x = x[:terminusLength]
        t = []
        for kmer in utils.kmers(x, args.kTuple):
            d = dict(template)
            d[kmer] = 1
            t.append(list(d.values()))
        # Pad from the number of rows actually produced; a non-positive
        # difference adds nothing.
        t.extend([p] * (rows - len(t)))
        T.append(np.array(t))
    T = np.array(T)

    # Feature count reported to the saver; 0 for an unrecognised seqType.
    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = 4**args.kTuple
    elif seqType == 'PROT':
        totalFeature = 20**args.kTuple
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'pkmer')


#end-def
Пример #10
0
	gen_kmers <k> sample <sample name>

		or

	gen_kmers <k> genomes <opt:full>
"""
# Command-line arguments: the k-mer length first, then the mode string.
k = int(sys.argv[1])
version = sys.argv[2]

if version == 'sample':
	# Sample mode: the third argument names a file under samples/ that is
	# read one line at a time.
	sample_name = sys.argv[3]
	filename = 'samples/%s.txt' % sample_name
	sample_kmers = kmer_store()
	with open(filename) as f:
		for read in f:
			# Pass every k-mer of the whitespace-stripped line to the store.
			read_kmers = kmers(read.strip(), k)
			for kmer in read_kmers:
				sample_kmers.update(kmer)

	# Output path: pickles/<input basename with '.txt' removed>_kmers_<k>.pickle
	output_filename = 'pickles/%s_kmers_%d.pickle' % (os.path.basename(os.path.normpath(filename)).replace('.txt',''), k)
	# NOTE(review): the file is opened in text mode ('w'); pickling on
	# Python 3 needs binary mode - confirm the target interpreter.
	with open(output_filename, 'w') as f:
		cPickle.dump(sample_kmers.kmers, f)

elif version =='genomes':
	full = True if (len(sys.argv) == 4 and sys.argv[3] == 'full') else False
	kmer_spectra = defaultdict(lambda:[0]*20)
	for index, genome_filename in enumerate(progress(filter(lambda x: x.endswith('.fna'), os.listdir('genomes')))):
		kmer_spectrum = {} if full else kmer_store()
		for kmer in kmers(nucleotides_fna('genomes/'+genome_filename), k):
			if full:
				kmer_spectrum[kmer] = kmer_spectrum[kmer]+1 if kmer in kmer_spectrum else 1
Пример #11
0
    L = len(matrix)
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1/(1+exp(ep-mu)))**(Ne-1)
    return log(acc)

def log_ZM_analytic(params, N):
    """Return ``N * log_ZS_analytic((matrix, mu, Ne))``.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.
    """
    matrix, mu, Ne = params
    log_ZS = log_ZS_analytic((matrix, mu, Ne))
    return N * log_ZS
    
def log_Z_analytic(params, N):
    """Compute log Z exactly: ``N`` times the log of the k-mer sum.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.

    For each k-mer produced by ``kmers(len(matrix))`` the term
    ``(1 / (1 + exp(ep - mu))) ** (Ne - 1)`` is accumulated, where ``ep`` is
    ``score_seq(matrix, kmer)``. Returns ``N * log(total)``.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu)))**nu
    return N * log(acc)

def log_ZS_naive(params, trials=1000):
    """Estimate log Z_S by sampling ``trials`` sites.

    ``params`` is the ``(matrix, mu, Ne)`` triple, passed as one positional
    argument exactly as before. It is unpacked in the body because
    tuple parameters in the signature are a SyntaxError on Python 3.

    Each trial scores ``random_site(L)`` with ``score_seq`` and accumulates
    ``(1 / (1 + exp(ep - mu))) ** (Ne - 1)``. The result is
    ``L * log(4) + log(mean term)``. Raises ZeroDivisionError when
    ``trials`` is 0, as before.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)
    acc = 0
    # range() replaces xrange() so the loop runs on Python 2 and 3 alike.
    for _ in range(trials):
        ep = score_seq(matrix, random_site(L))
        acc += (1 / (1 + exp(ep - mu)))**nu
    mean_Zs = acc / trials
    return L * log(4) + log(mean_Zs)

def log_ZM_naive((matrix, mu, Ne), N, trials=1000):
Пример #12
0
def compute_Z(model):
    """Sum ``exp(score(model, seq))`` over every sequence from ``kmers(L)``.

    ``L`` is derived from the number of entries in ``model`` as
    ``int(1 + sqrt(1 + 8 * len(model)) / 2)``.
    """
    n_params = len(model)
    half_root = sqrt(1 + 8 * n_params) / 2
    L = int(1 + half_root)
    weights = (exp(score(model, "".join(letters))) for letters in kmers(L))
    return sum(weights)