def seq_to_seq_map(dict_or_file_or_dataframe):
    """Build a mapping from segment name (PID) to sequence.

    The argument may be:
      * a dict -- returned as-is (the very same object, not modified),
      * a string -- treated as an Excel file name and read with pandas,
      * anything else -- used directly as a DataFrame.

    The table must provide the columns ``PID`` and ``Sequence``. Rows that
    contain any missing value are discarded, and surrounding whitespace is
    stripped from both the names and the sequences.
    """
    import pandas as pd

    source = dict_or_file_or_dataframe

    # A ready-made mapping needs no processing.
    if isinstance(source, dict):
        return source

    # A string names an Excel file; otherwise we were handed the table itself.
    table = pd.read_excel(source) if u.is_str(source) else source

    # dropna() returns a new frame, so the PID clean-up below is applied to
    # that new frame rather than to the object the caller passed in.
    table = table.dropna()
    table.PID = table.PID.str.strip()

    return {
        pid: sequence.strip()
        for pid, sequence in zip(table.PID, table.Sequence)
    }
def get_annotated_sequence(segments, seg_to_seq, linkers="GSGPG", N_tag="", C_tag=""):
    """Return an annotated amino-acid sequence of the polyhedron.

    Each segment is written on its own line as its sequence (padded to the
    length of the longest segment sequence, with '-' characters removed),
    followed by a tab, a '|' and the segment name. Segment lines are combined
    with the linkers via ``u.roundrobin`` (presumably alternating segment,
    linker, segment, ... -- confirm against the helper).

    Parameters:
        segments: sequence of segment names; each must be a key of the
            segment-to-sequence mapping.
        seg_to_seq: dict, DataFrame or Excel file name; converted with
            ``seq_to_seq_map``.
        linkers: a single string used between every pair of segments, or a
            list holding exactly ``len(segments) - 1`` linker strings.
        N_tag: text placed on the first line of the result.
        C_tag: text placed on the last line of the result.

    Returns:
        A single string with the lines joined by newlines.

    Raises:
        AssertionError: if the number of linkers is not one less than the
            number of segments.
    """
    seg_to_seq = seq_to_seq_map(seg_to_seq)

    N = len(segments)
    if u.is_str(linkers):
        linkers = [linkers] * (N - 1)

    # The expected count is N - 1 (one linker between each pair of segments),
    # so that is the number the message has to report.
    assert len(linkers) == N - 1, (
        "Length of linkers must be one less than the number of segments. "
        "Is {NL}, but should be {N}".format(NL=len(linkers), N=N - 1))

    max_seg_len = max(len(seg_to_seq[s]) for s in segments)

    aa_segments = [
        seg_to_seq[s].ljust(max_seg_len).replace('-', '') + "\t|" + s
        for s in segments
    ]

    lines = [N_tag] + list(u.roundrobin(aa_segments, linkers)) + [C_tag]
    return "\n".join(lines)
def load_pairs(yaml_str):
    """Load a YAML string and return a list of pair records.

    Every record must have the keys 'pair' and 'chains'. Each of them may be
    given either as a list or as a single colon-separated string; in both
    cases the value ends up as a list of whitespace-stripped strings.

    Parameters:
        yaml_str: YAML text describing a list of records.

    Returns:
        The parsed list with 'pair' and 'chains' normalized in place, or an
        empty list when the YAML document is empty.

    Raises:
        KeyError: if a record lacks the 'pair' or 'chains' key.
    """
    # safe_load instead of a bare yaml.load(): without an explicit Loader,
    # yaml.load can construct arbitrary Python objects from the input and is
    # rejected outright by PyYAML >= 6. The data here is plain lists, dicts
    # and strings, which the safe loader handles.
    pairs = yaml.safe_load(yaml_str)

    # An empty document parses to None; treat it as "no pairs".
    if pairs is None:
        return []

    for entry in pairs:
        # 'pair' first, then 'chains' -- both are normalized the same way.
        for key in ('pair', 'chains'):
            # A colon-separated string is split into its elements.
            if u.is_str(entry[key]):
                entry[key] = entry[key].split(':')
            # Strip whitespace around every element.
            entry[key] = [item.strip() for item in entry[key]]

    return pairs
def do_assignment_replacements(topology, assignments):
    """Replace topology segments according to the replacement rules.

    For each segment in ``topology`` whose upper-cased name is a key of
    ``assignments``, the next unused element of that key's list is taken:
    the first occurrence of a name gets the first element, the second
    occurrence the second element, and so on. Segments without a rule are
    kept unchanged.

    Parameters:
        topology: iterable of segment name strings.
        assignments: dict mapping segment names to lists of replacements, or
            a string that is converted with ``segment_assignments_to_dict``.

    Returns:
        A new list with the replacements applied.

    Raises:
        IndexError: if a name occurs in ``topology`` more often than there
            are replacements listed for it.
    """
    if u.is_str(assignments):
        assignments = segment_assignments_to_dict(assignments)

    # Replacements are consumed with pop(0) below. A shallow dict copy would
    # still share the value lists with the caller and empty them, so every
    # list is copied as well.
    remaining = {name: list(values) for name, values in assignments.items()}

    ret = []
    for s in topology:
        key = s.upper()
        if key in remaining:
            ret.append(remaining[key].pop(0))
        else:
            ret.append(s)
    return ret
def test_is_str():
    """Check that u.is_str accepts strings and rejects other objects."""
    accepted = ("A string", u"A string")
    rejected = ({}, [], 1)

    for value in accepted:
        assert u.is_str(value)

    for value in rejected:
        assert not u.is_str(value)