'''


#print 'subspace:'
#subSpace.display()

#print 'Measurements in set'
#print expandedSpace.listMeasurementsInSet()

# loop over measurements and load them into Measurements
fileName = 'None'
for msrmtInSet in expandedSpace.listMeasurementsInSet():
     if fileName != allMeasurements[msrmtInSet][0]:
	fileName = allMeasurements[msrmtInSet][0]
	rdatFileName = 'ETERNA'+fileName[3:12]+'.rdat'
        rdat = RDATFile()
        rdat.load(open('/home/qmac/projects/testdir/'+rdatFileName))
        offset=0
     constructs = rdat.constructs.values()[0]
    # pdb.set_trace()
     dsection = constructs.data[allMeasurements[msrmtInSet][1]]
     if dsection.annotations['sequence'][0] != expandedSpace.sequences[msrmtInSet]:
	print 'Error, sequences not the same!'
	sys.exit()
     seq=dsection.annotations['sequence'][0] 

     countZeros = 0
     rdatLength = len(dsection.values)
     for j in range(1,rdatLength):
	if dsection.values[-j] != 0.0:
	    break
示例#2
0
def get_restricted_RDATFile_and_plot_data(constructs, numresults, qdata, searchid, ssdict, check_structure_balance):
    """Build an RDATFile restricted to the queried positions, plus plot data.

    For every construct the sequence positions selected by ``qdata`` are
    resolved into position lists; each non-empty list becomes one RDATSection
    whose data rows are the stored data sections clipped to those positions.

    :param constructs: iterable of construct-section records; ``seqpos`` (a
        bracketed, comma-separated string), ``offset``, ``sequence``,
        ``structure`` and ``id`` are read from each one
    :param numresults: maximum number of rows/labels/ids returned for display
    :param qdata: query mapping. ``'all'`` selects every position in
        ``seqpos``; ``'sequence'`` and ``'structure'`` hold regular
        expressions matched against the upper-cased searchable text;
        ``'secstructelems'`` holds keys into ``ssdict[construct.id]``
    :param searchid: identifier written into the RDAT comments
    :param ssdict: ``ssdict[construct.id][elem]`` yields position lists that
        are shifted by ``offset + 1`` before use
    :param check_structure_balance: when true, structure matches are kept
        only if ``check_balance`` accepts the matched structure substring
    :returns: 15-tuple ``(rdat, table, cell_labels, values_min, values_max,
        values_min_heatmap, values_max_heatmap, unpaired_bins, paired_bins,
        unpaired_bin_anchors, paired_bin_anchors, rmdb_ids, messages,
        numallresults, render)``. ``table`` is a ``'Position'`` header row
        followed by at most ``numresults`` value rows; ``render`` is False
        when nothing matched.
    :raises IndexError: when ``qdata`` has neither ``'all'`` nor any of the
        position-selecting fields, so there is no position list to start from
    """
    rdat = RDATFile()
    all_values = []
    rmdb_ids = []
    values_min = float('Inf')
    values_max = float('-Inf')
    values_min_heatmap = float('Inf')
    values_max_heatmap = float('-Inf')
    messages = []
    paired_bins = []
    unpaired_bins = []
    paired_bin_anchors = []
    unpaired_bin_anchors = []
    paired_merged_data = []
    unpaired_merged_data = []
    cell_labels = []

    for c in constructs:
        entry = RMDBEntry.objects.get(constructsection=c)
        seqpos = [int(i) for i in c.seqpos.strip('[]').split(',')]
        offset = int(c.offset)

        # Position -> column of the first occurrence in seqpos (what
        # list.index would return), built once instead of rescanning the list
        # for every value of every data section.
        first_index = {}
        for column, pos in enumerate(seqpos):
            first_index.setdefault(pos, column)

        rseqpos_byquery = {}
        searchable_fields = {}
        seqpos_offset = min(seqpos) - offset - 1
        searchable_fields['sequence'] = ''.join(
            [s for i, s in enumerate(c.sequence) if i + offset + 1 in first_index])
        searchable_fields['structure'] = ''.join(
            [s for i, s in enumerate(c.structure) if i + offset + 1 in first_index])

        if 'all' in qdata:
            rseqposes = [seqpos]
        else:
            for field in qdata:
                if field in ('sequence', 'structure'):
                    matches = [range(m.start() + seqpos_offset, m.end() + seqpos_offset)
                               for m in re.finditer(qdata[field], searchable_fields[field].upper())]
                    if check_structure_balance and field == 'structure':
                        field_seqpos = []
                        for match in matches:
                            if check_balance(''.join([c.structure[i] for i in match])):
                                field_seqpos.append([i + offset + 1 for i in match])
                    else:
                        field_seqpos = [[i + offset + 1 for i in match] for match in matches]
                    # Sequence and structure hits share the 'motif' slot: the
                    # second field keeps only hits the first one also produced.
                    if 'motif' in rseqpos_byquery:
                        rseqpos_byquery['motif'] = [match for match in rseqpos_byquery['motif']
                                                    if match in field_seqpos]
                    else:
                        rseqpos_byquery['motif'] = field_seqpos
                if field == 'secstructelems':
                    rseqpos_byquery[field] = []
                    for elem in qdata[field]:
                        for poslist in ssdict[c.id][elem]:
                            rseqpos_byquery[field].append([i + offset + 1 for i in poslist])

            # list(...) keeps this working where dict.values() is a view;
            # an empty mapping still raises IndexError as before.
            rseqposes = list(rseqpos_byquery.values())[0]
            # NOTE(review): as written, `poslist` can never become non-empty
            # (the append branch requires an existing neighbour and the else
            # branch resets it to []), so `tmp` stays empty and this loop
            # leaves `rseqposes` unchanged. The parentheses below only spell
            # out the precedence Python already applied. Confirm the intended
            # run-merging semantics before changing it.
            for position_lists in list(rseqpos_byquery.values()):
                tmp = []
                for poslist1 in position_lists:
                    for poslist2 in rseqposes:
                        poslist = []
                        for i in poslist1:
                            if i in poslist2:
                                if (len(poslist) > 0 and i - 1 in poslist) or i + 1 in poslist:
                                    poslist.append(i)
                                else:
                                    if len(poslist) > 0:
                                        tmp.append(poslist)
                                    poslist = []
                        if len(poslist) > 0:
                            tmp.append(poslist)
                rseqposes += tmp

        for rseqpos in rseqposes:
            if len(rseqpos) == 0:
                continue
            # Sort a copy: under the 'all' query rseqpos IS seqpos, and an
            # in-place sort would reorder seqpos relative to the value columns
            # it indexes (assumes stored values follow seqpos order - TODO
            # confirm against the writer of DataSection.values).
            rseqpos = sorted(rseqpos)
            # (position, value column) pairs for the positions that have data.
            selected = [(pos, first_index[pos]) for pos in rseqpos if pos in first_index]

            section = RDATSection()
            section.name = '%s:%s:%s' % (entry.rmdb_id, rseqpos[0], rseqpos[-1])
            section.offset = c.offset
            section.sequence = c.sequence
            section.structure = c.structure
            section.annotations = {}
            section.xsel = []
            section.data = []
            section.mutpos = []
            section.data_types = []
            section.seqpos = rseqpos
            rdat.traces[section.name] = []
            rdat.xsels[section.name] = []
            rdat.values[section.name] = []
            rdat.errors[section.name] = []

            for idx, datasection in enumerate(DataSection.objects.filter(construct_section=c)):
                dsection = RDATSection()
                parsedvalues = datasection.values.split(',')
                dsection.values = [float(parsedvalues[column]) for _, column in selected]
                valarray = array([float(p) for p in parsedvalues])
                if len(dsection.values) == 0:
                    # No data on the requested positions, try the next row.
                    continue

                if len(datasection.errors) > 0:
                    parsederrors = datasection.errors.split(',')
                    dsection.errors = [float(parsederrors[column]) for _, column in selected]
                else:
                    dsection.errors = []
                if len(datasection.xsel) > 0:
                    parsedxsels = datasection.xsel.split(',')
                    dsection.xsel = [float(parsedxsels[column]) for _, column in selected]
                else:
                    dsection.xsel = []

                all_values.append([section.name + ':' + str(idx + 1)] +
                                  [valarray[column] for _, column in selected])
                cell_labels.append([c.sequence[pos - offset - 1] + c.structure[pos - offset - 1]
                                    for pos, _ in selected])
                # The old guard `len(...) > -1` was always true; a blank
                # structure contributes nothing to either histogram anyway.
                if c.structure.strip():
                    paired_merged_data += [valarray[column] for pos, column in selected
                                           if c.structure[pos - offset - 1] in ('(', ')')]
                    unpaired_merged_data += [valarray[column] for pos, column in selected
                                             if c.structure[pos - offset - 1] == '.']

                if datasection.trace:
                    dsection.traces = [float(d) for d in datasection.trace.split(',')]
                else:
                    dsection.traces = []
                if datasection.reads:
                    dsection.reads = [float(d) for d in datasection.reads.split(',')]
                else:
                    dsection.reads = []

                section.data.append(dsection)
                rdat.traces[section.name].append(dsection.traces)
                # `reads` is the one per-section list that was never
                # initialised above, so create it on first use.
                rdat.reads.setdefault(section.name, []).append(dsection.reads)
                rdat.values[section.name].append(dsection.values)
                rdat.xsels[section.name].append(dsection.xsel)
                rdat.errors[section.name].append(dsection.errors)
                dsection.annotations = dict(
                    (a.name, a.value) for a in DataAnnotation.objects.filter(section=datasection))
                rdat.constructs[section.name] = section
                rmdb_ids.append(entry.rmdb_id)

    numallresults = len(all_values)
    rdat.loaded = True
    rdat.comments = 'Query results for %s in the Stanford RMDB on %s. Search id %s' % (qdata, datetime.datetime.now(), searchid)
    if len(rmdb_ids) > numresults:
        messages.append('Your query exceeded %s results, showing just the first %s' % (numresults, numresults))

    shown = all_values[:numresults]
    for row in shown:
        values_min_heatmap = min(values_min_heatmap, min(row[1:]))
        values_max_heatmap = max(values_max_heatmap, max(row[1:]))
    # Guard on the displayed slice: it can be empty (numresults == 0) even
    # when all_values is not, and max() of an empty sequence raises.
    maxlen = max(len(row) for row in shown) if shown else 0
    # Pad short rows in place so the table is rectangular.
    for row in all_values:
        if len(row) < maxlen:
            row += [float('NaN')] * (maxlen - len(row))

    if len(rmdb_ids) > 0:
        # Fixed histogram window; values outside it are left out of the bins.
        values_max = 2
        values_min = -1
        paired_merged_data = array(paired_merged_data)
        unpaired_merged_data = array(unpaired_merged_data)
        paired_indices = logical_and(paired_merged_data >= values_min, paired_merged_data <= values_max)
        unpaired_indices = logical_and(unpaired_merged_data >= values_min, unpaired_merged_data <= values_max)
        if len(unpaired_merged_data) > 0:
            unpaired_bins, unpaired_bin_anchors = (x.tolist() for x in hist(unpaired_merged_data[unpaired_indices], 100)[:2])
        if len(paired_merged_data) > 0:
            paired_bins, paired_bin_anchors = (x.tolist() for x in hist(paired_merged_data[paired_indices], 100)[:2])
        row_length = len(all_values[0])
        render = True
    else:
        render = False
        row_length = 0
    return rdat, [['Position'] + [str(i+1) for i in range(row_length-1)]] + all_values[:numresults], cell_labels[:numresults], values_min, values_max, values_min_heatmap, values_max_heatmap, unpaired_bins, paired_bins, unpaired_bin_anchors, paired_bin_anchors, rmdb_ids[:numresults], messages, numallresults, render
示例#3
0
from matplotlib.pylab import *
from rdatkit.datahandlers import RDATFile
from rdatkit.view import VARNA
from rdatkit.secondary_structure import fold
from rdatkit.mapping import MappingData, normalize
from analysis import eigen_reactivities
import sys

# Load the RDAT file named on the command line. The handle is closed as soon
# as loading finishes instead of being left open for the whole run.
rdat = RDATFile()
with open(sys.argv[1]) as rdat_handle:
    rdat.load(rdat_handle)

# Values of the first construct, normalized one row at a time.
vals = array(list(rdat.values.values())[0])
for i in range(shape(vals)[0]):
    vals[i, :] = normalize(vals[i, :])
eigenrs = eigen_reactivities(vals)

matshow(vals)
matshow(eigenrs)
show()

construct = list(rdat.constructs.values())[0]
# Only the first 35 rows of eigenrs are folded and drawn.
for i, e in enumerate(eigenrs[:35]):
    sequence = construct.sequence
    md = MappingData(data=e, seqpos=[s - construct.offset - 1 for s in construct.seqpos])
    # Fold once and reuse the result; it used to be computed a second time
    # with identical arguments just to read the first structure.
    folded = fold(sequence, mapping_data=md)
    print(folded)
    structure = folded[0].dbn
    VARNA.cmd(sequence, structure, 'test_results/eigen_struct%s.png' % i)



示例#4
0
from matplotlib.pylab import *
from rdatkit.datahandlers import RDATFile
from rdatkit.view import VARNA
from rdatkit.secondary_structure import fold
from rdatkit.mapping import MappingData, normalize
from analysis import eigen_reactivities
import sys

# Load the RDAT file named on the command line. The handle is closed as soon
# as loading finishes instead of being left open for the whole run.
rdat = RDATFile()
with open(sys.argv[1]) as rdat_handle:
    rdat.load(rdat_handle)

# Values of the first construct, normalized one row at a time.
vals = array(list(rdat.values.values())[0])
for i in range(shape(vals)[0]):
    vals[i, :] = normalize(vals[i, :])
eigenrs = eigen_reactivities(vals)

matshow(vals)
matshow(eigenrs)
show()

construct = list(rdat.constructs.values())[0]
# Only the first 35 rows of eigenrs are folded and drawn.
for i, e in enumerate(eigenrs[:35]):
    sequence = construct.sequence
    md = MappingData(
        data=e, seqpos=[s - construct.offset - 1 for s in construct.seqpos])
    # Fold once and reuse the result; it used to be computed a second time
    # with identical arguments just to read the first structure.
    folded = fold(sequence, mapping_data=md)
    print(folded)
    structure = folded[0].dbn
    VARNA.cmd(sequence, structure, 'test_results/eigen_struct%s.png' % i)
示例#5
0
args = parser.parse_args()

# Fragment categories; each one gets its own value list, error list and index.
fragtypes = ['all', 'helices', 'interiorloops', 'hairpins', 'dangles', 'bulges',
             '2wayjunctions', '3wayjunctions', '4wayjunctions', '5wayjunctions',
             'unpaired', 'edgepairs', 'internalpairs']
db = {}
dberrors = {}
dbidx = {}
for t in fragtypes:
    db[t] = []
    dberrors[t] = []
    dbidx[t] = {}
for filename in os.listdir(args.rdatdir):
    rdat_path = os.path.join(args.rdatdir, filename)
    # Sub-directories used to be excluded from the progress print only and
    # were then passed to open(), which fails; skip them outright.
    if os.path.isdir(rdat_path):
        continue
    print(filename)
    rdat = RDATFile()
    # Close each file once it is loaded rather than leaking one handle per file.
    with open(rdat_path) as rdat_handle:
        rdat.load(rdat_handle)
    for cname in rdat.constructs:
        construct = rdat.constructs[cname]
        struct = SecondaryStructure(construct.structure)
        frags = struct.explode()
        for data in construct.data:
            # Keep rows that carry no mutation annotation or are marked 'WT'.
            if 'mutation' not in data.annotations or 'WT' in data.annotations['mutation']:
                if 'modifier' in data.annotations:
                    if args.normalize:
                        normvals = normalize(data.values)
                    else:
                        normvals = data.values
                        # Interquartile range of the raw values.
                        iqr = scoreatpercentile(normvals, 75) - scoreatpercentile(normvals, 25)
示例#6
0
def parse_rdat_data(request, is_get_file):
    """Load an RDAT file for a request and flatten it into prediction inputs.

    The RDAT content comes either from the uploaded ``rdatfile`` (when
    ``is_get_file`` is true) or from the stored file of the current version
    of the RMDB entry named by ``request.POST['rmdbid']``.

    :param request: request object; ``POST['sequences']``, the presence of
        ``POST['clipsequence']``, ``POST['rmdbid']`` and ``FILES['rdatfile']``
        are read
    :param is_get_file: true to read the uploaded file, false to read the
        stored RMDB file
    :returns: tuple ``(messages, valerrors, bonuses_1d, bonuses_2d, titles,
        modifiers, offset_seqpos, temperature, sequences, refstruct)``
    """
    sequences, titles, modifiers, messages, valerrors, offset_seqpos = (
        [], [], [], [], [], [])
    temperature = 37
    rdatfile = RDATFile()
    refstruct = secondary_structure.SecondaryStructure()
    clip_to_seqpos = 'clipsequence' in request.POST

    if len(request.POST['sequences']):
        messages.append(
            'WARNING: Using sequences and/or structures from received RDAT file content. Original input in fields were overwritten.'
        )

    if is_get_file:
        uploadfile = request.FILES['rdatfile']
        rf = write_temp_file('/tmp/%s' % uploadfile.name)
    else:
        rmdbid = request.POST['rmdbid'].strip()
        version = RMDBEntry.get_current_version(rmdbid)
        rf = open(
            PATH.DATA_DIR['FILE_DIR'] + '/%s/%s_%s.rdat' %
            (rmdbid, rmdbid, version), 'r')
    # Close the handle even when parsing fails.
    try:
        rdatfile.load(rf)
    finally:
        rf.close()

    # A file-level modifier annotation applies to every data row.
    is_modified = 'modifier' in rdatfile.annotations
    if is_modified:
        modifier = ','.join(rdatfile.annotations['modifier'])

    for cname in rdatfile.constructs:
        c = rdatfile.constructs[cname]

        if 'temperature' in c.annotations:
            temperature = c.annotations['temperature']

        # NOTE(review): both bonus containers restart for every construct, so
        # the returned ones describe the last construct only, while titles,
        # sequences and modifiers accumulate across constructs - confirm that
        # this is intended.
        bonuses_1d = []
        bonuses_2d = []
        seqpos_min = min(c.seqpos)

        if clip_to_seqpos:
            # The largest index used below is max(seqpos) - offset - 1, so the
            # string must be strictly longer than that. The previous `>=`
            # let a one-too-short string through and raised IndexError
            # instead of taking the warning fallback.
            if len(c.sequence) > max(c.seqpos) - c.offset - 1:
                seq = ''.join(
                    [c.sequence[i - c.offset - 1] for i in sorted(c.seqpos)])
            else:
                messages.append(
                    'WARNING: SEQUENCE and SEQPOS mismatch for construct %s in RDAT file. SEQPOS ignored.'
                    % c.name)
                c.seqpos = [(i + 1) for i in range(len(c.sequence))]
                seq = c.sequence

            if len(c.structure) > max(c.seqpos) - c.offset - 1:
                struct = ''.join(
                    [c.structure[i - c.offset - 1] for i in sorted(c.seqpos)])
            else:
                messages.append(
                    'WARNING: STRUCTURE and SEQPOS mismatch for construct %s in RDAT file. STRUCTURE ignored.'
                    % c.name)
                struct = '.' * (max(c.seqpos) - c.offset - 1)
                c.structure = struct
        else:
            seq = c.sequence
            struct = c.structure

        # The first construct's structure becomes the reference structure.
        if len(refstruct) == 0:
            refstruct = secondary_structure.SecondaryStructure(dbn=struct)

        for d in c.data:
            # Only rows with a modifier (file-level or their own) are used.
            if not (is_modified or 'modifier' in d.annotations):
                continue

            s = seq
            is_2d = False
            if 'mutation' in d.annotations:
                for mut in d.annotations['mutation']:
                    if 'WT' == mut.strip():
                        break
                    is_2d = True
                    idx = int(mut.strip()[1:-1])
                    base = mut[-1]
                    # NOTE(review): the substitution lands on index
                    # idx - c.offset, while every other lookup in this
                    # function uses position - offset - 1 (and clipped
                    # sequences start at seqpos_min) - confirm the intended
                    # numbering before relying on the mutated sequence.
                    s = s[:idx - c.offset] + base + s[idx - c.offset + 1:]
                titles.append(';'.join(d.annotations['mutation']))
            else:
                titles.append(cname)
            sequences.append(s)

            bonuses_1d.append([str(x) for x in d.values])

            # Shift that turns a seqpos entry into an index of the sequence
            # that is returned (clipped or full).
            if clip_to_seqpos:
                offset = seqpos_min
            else:
                offset = c.offset + 1
            offset_seqpos.append([i - offset for i in c.seqpos])

            if is_2d:
                if len(bonuses_2d) == 0:
                    bonuses_2d = zeros([len(seq), len(seq)])
                # Column = last parsed mutation position, rows = probed positions.
                for i, pos in enumerate(c.seqpos):
                    bonuses_2d[pos - offset, idx - offset] = d.values[i]

            if is_modified:
                modifiers.append(modifier)
            else:
                modifiers.append(','.join(d.annotations['modifier']))

    return (messages, valerrors, bonuses_1d, bonuses_2d, titles, modifiers,
            offset_seqpos, temperature, sequences, refstruct)
示例#7
0
def get_constructs_from_rdats(dir):
    """
    using rdatkit parse all RDAT files in the directory specified and parse
    each construct's sequence, structure and score into construct objects.
    ONLY files whose name ends with ".rdat" are recognized as RDAT files,
    every other directory entry is skipped

    Data entries are left out when they have no 'signal_to_noise' annotation,
    when that annotation starts with "weak", or when either 'MAPseq' field
    mentions "Mutate and Map".

    :params dir: directory with rdat files
    :type dir: str
    :returns: List of Construct Objects
    :raises ValueError: if the directory holds no ".rdat" file

    """

    # make sure files are rdat files; the old check compared only the last
    # four characters, so a name without the dot (e.g. "notes_rdat") passed
    rdat_files = [path for path in glob.glob(dir + "/*") if path.endswith(".rdat")]

    if len(rdat_files) == 0:
        raise ValueError("no rdat files in directory "+dir+" files must have rdat extension to be recognized")

    construct_objs = []

    # iterate the filtered list: looping over every directory entry defeated
    # the extension check above
    for path in rdat_files:

        r = RDATFile()
        # close each file as soon as it has been parsed
        with open(path) as handle:
            r.load(handle)

        # data entries of the first construct in the file
        data_entries = list(r.constructs.values())[0].data

        for entry in data_entries:
            annotations = entry.annotations

            # some data entries dont have signal_to_noise variable, skip over
            # them
            if 'signal_to_noise' not in annotations:
                continue

            # dont want to include weak data
            if annotations['signal_to_noise'][0].split(":")[0] == "weak":
                continue

            name = annotations['MAPseq'][0]
            project_name = annotations['MAPseq'][1]

            # mutate and map data wont be useful since target structure is not
            # correct with the mutation
            if "Mutate and Map" in name or "Mutate and Map" in project_name:
                continue

            # the score is the third ':'-separated field of the annotation
            score = annotations['EteRNA'][0].split(":")[2]

            construct_objs.append(
                Construct(seq=annotations['sequence'][0],
                          ss=annotations['structure'][0],
                          score=score))

    return construct_objs
示例#8
0
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import rdatkit.secondary_structure as ss
import sys
import pickle
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("infile",help="input file name, please end with .rdat")
parser.add_argument("outfile",help="output file name no extition needed")
args = parser.parse_args()


#import rdat data
rdat = RDATFile()
# close the input as soon as it is loaded instead of leaving the handle open
with open('/home/qmac/projects/testdir/'+args.infile) as rdat_handle:
    rdat.load(rdat_handle)
offset=0
# only the first construct of the file is used
constructs = list(rdat.constructs.values())[0]
competing_pairs = []
sequences_included=[]
msrmtsNumbers=[]
# one pre-formatted string prints the same text under either print syntax
print('lenth of constructs.data %s' % len(constructs.data))

for count, dsection in enumerate(constructs.data):
     seq=dsection.annotations['sequence'][0]
     #structs=ss.fold(seq,nstructs=2)
     # (dot-bracket, energy) pairs of the suboptimal structures of this sequence
     struct_energy_list =[(struct.dbn, energy) for struct, energy in zip(*ss.subopt(seq,nstructs=100,fraction=0.075,energies=True))]
     # drop duplicates, then order by energy
     struct_energy_list_unique = sorted(set(struct_energy_list), key=lambda x: x[1])
示例#9
0
args = parser.parse_args()

fragtypes = ['all', 'helices', 'interiorloops', 'hairpins', 'dangles', 'bulges',\
        '2wayjunctions', '3wayjunctions', '4wayjunctions', '5wayjunctions', 'unpaired', 'edgepairs', 'internalpairs']
db = {}
dberrors = {}
dbidx = {}
for t in fragtypes:
    db[t] = []
    dberrors[t] = []
    dbidx[t] = {}
for filename in os.listdir(args.rdatdir):
    if not os.path.isdir(args.rdatdir + '/' + filename):
        print filename
    rdat = RDATFile()
    rdat.load(open(args.rdatdir + '/' + filename))
    for cname in rdat.constructs:
        construct = rdat.constructs[cname]
        struct = SecondaryStructure(construct.structure)
        frags = struct.explode()
        for data in construct.data:
            if (('mutation' not in data.annotations) or \
                    ('mutation' in data.annotations and \
                    'WT' in data.annotations['mutation'])):
                if 'modifier' in data.annotations:
                    if args.normalize:
                        normvals = normalize(data.values)
                    else:
                        normvals = data.values
                        iqr = scoreatpercentile(