# NOTE: these functions assume that os, pandas (as pd) and numpy (as np) have
# been imported, and that the package helpers referenced below (smart_open,
# check_valid_line, separate_seq_and_el_data, get_seq_count, get_time_stamp,
# one_hot_encode_sequence, remove_flanks_from_seq, insert_seq_into_scaffold,
# the `organize` module, and the constants ROOT_DIR, METHODS and MODELS) are
# available elsewhere in the package.


def sort_by_exp_level(input_seqs):
    """
    Given an input file of sequences, tab-separated with their associated
    expression levels, sorts the lines of the file by expression level,
    with the highest levels at the top of the file.

    Args:
    -----
    input_seqs (str) -- the absolute path of the input file containing
    sequences to be sorted by expression level.

    Returns:
    -----
    sorted_df (pandas.DataFrame) -- a data frame where rows are sorted in
    descending order based on expression level.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Path name for input file must be '
                                         'passed as a string.')
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    # Check whether the file starts with the 2 metadata lines
    # ('number_of_seqs_in_file' and 'length_of_each_sequence') so that they
    # can be skipped when the data is read into pandas.
    with smart_open(input_seqs, 'r') as f:
        line = check_valid_line(f.readline())
        seq1, _ = separate_seq_and_el_data(line)
        line = check_valid_line(f.readline())
        seq2, _ = separate_seq_and_el_data(line)
    exp_seq1 = 'number_of_seqs_in_file'
    exp_seq2 = 'length_of_each_sequence'
    if seq1 == exp_seq1 and seq2 == exp_seq2:
        skip = 2
    else:
        skip = 0
    # Import data into a pandas data frame
    df = pd.read_csv(input_seqs, sep='\t', names=['seq', 'el'],
                     skiprows=skip)
    # Sort it based on expression level
    sorted_df = df.sort_values('el', ascending=False)
    sorted_df = sorted_df.reset_index()
    sorted_df = sorted_df.drop(columns='index')

    return sorted_df

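# Usage sketch (illustrative, not part of the original module; the file path
# below is a hypothetical example):
#
#     sorted_df = sort_by_exp_level('/abs/path/to/pTpA_seqs_processed.txt')
#     sorted_df.head()    # rows with the highest expression levels first
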
def write_num_and_len_of_seqs_to_file(input_seqs):
    """
    Prepends the number of sequences and the length of the sequences in an
    input file to the first 2 lines of the file. Assumes sequences have
    been processed so that all sequences have been padded to the same
    length.

    The first 2 lines of the input file will be in the following format
    after writing the info to the file:

    "
    number_of_seqs_in_file\t<###>\n
    length_of_each_sequence\t<$$$>\n
    "

    where '<###>' is the number of sequences in the file, and '<$$$>' is
    the length to which every sequence in the file is padded.

    Args:
    -----
    input_seqs (str) -- the absolute path of the processed input sequences
    to extract information from.

    Returns:
    -----
    None
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Absolute pathname must be passed '
                                         'as a string.')
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    num_seqs = get_seq_count(input_seqs)
    with smart_open(input_seqs, 'r') as f:
        line = check_valid_line(f.readline())
        if line == 'skip_line':
            raise AssertionError('First line is not valid.')
        seq, _ = separate_seq_and_el_data(line)
        len_seqs = len(seq)  # assumes all sequences padded to same length
    with smart_open(input_seqs, 'r+') as f:
        contents = f.read()
    with smart_open(input_seqs, 'w+') as f:
        line_to_append = 'number_of_seqs_in_file\t' + str(num_seqs) + '\n'
        line_to_append += 'length_of_each_sequence\t' + str(len_seqs) + '\n'
        if input_seqs.endswith('.gz'):
            line_to_append = line_to_append.encode()
        f.write(line_to_append + contents)

    return

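# Usage sketch (illustrative; the path is hypothetical). After the call, the
# first 2 lines of the file hold the sequence count and the padded length,
# which get_num_and_len_of_seqs_from_file() reads back later in the pipeline:
#
#     write_num_and_len_of_seqs_to_file('/abs/path/to/pTpA_seqs_padded.txt')
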
def get_max_min_mode_length_of_seqs(input_seqs):
    """
    Returns the maximum, minimum, and modal length of the sequences in a
    file containing input sequences.

    Args:
    -----
    input_seqs (str) -- the absolute path of the file containing the input
    sequences and their expression levels, tab separated.

    Returns:
    -----
    max_length (int) -- the length of the longest sequence in the input
    file.

    min_length (int) -- the length of the shortest sequence in the input
    file.

    modal_length (int) -- the most common sequence length of the sequences
    in the input file.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Path name for input file must be '
                                         'passed as a string.')
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    infile = smart_open(input_seqs, 'r')
    seq_lengths = []
    for line in infile:
        line = check_valid_line(line)
        if line == 'skip_line':
            continue
        seq, exp_level = separate_seq_and_el_data(line)
        seq_lengths.append(len(seq))
    max_length = max(seq_lengths)
    min_length = min(seq_lengths)
    modal_length = max(set(seq_lengths), key=seq_lengths.count)
    # Close the input file.
    infile.close()

    return max_length, min_length, modal_length

def get_num_and_len_of_seqs_from_file(input_seqs):
    """
    Returns the number of sequences and the length of the sequences in an
    input file. Assumes sequences have been processed so that all
    sequences have been padded to the same length, and that the file
    containing the processed sequences has its first 2 lines in the
    following format:

    "
    number_of_seqs_in_file\t<###>
    length_of_each_sequence\t<$$$>
    "

    where '<###>' is the number of sequences in the file, and '<$$$>' is
    the length to which every sequence in the file is padded.

    Args:
    -----
    input_seqs (str) -- the absolute path of the processed input sequences
    to extract information from.

    Returns:
    -----
    num_seqs (int) -- the number of sequences in input_seqs.

    len_seqs (int) -- the length of all the padded sequences in the input
    file.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Absolute pathname must be passed '
                                         'as a string.')
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    # Functionality
    with smart_open(input_seqs, 'r') as f:
        # Parse first line of file containing info about num of seqs in file
        first_line = check_valid_line(f.readline())
        assert first_line != 'skip_line', (
            'Invalid first line of file. Must be of the form: '
            '"number_of_seqs_in_file\t<###>" where <###> is the number of '
            'sequences in the file.')
        token, num_seqs = separate_seq_and_el_data(first_line)
        if num_seqs % 1 != 0:
            raise ValueError('Number of sequences on first line must be an '
                             'integer.')
        assert token == 'number_of_seqs_in_file', (
            'First line of the input file must be of the form: '
            '"number_of_seqs_in_file\t<###>" where <###> is the number of '
            'sequences in the file.')
        # Parse 2nd line of file containing info about length of seqs in file
        second_line = check_valid_line(f.readline())
        assert second_line != 'skip_line', (
            'Invalid second line of file. Must be of the form: '
            '"length_of_each_sequence\t<###>" where <###> is the length of '
            'every sequence in the file.')
        token, len_seqs = separate_seq_and_el_data(second_line)
        if len_seqs % 1 != 0:
            raise ValueError('Sequence length on second line must be an '
                             'integer.')
        assert token == 'length_of_each_sequence', (
            'Second line of the input file must be of the form: '
            '"length_of_each_sequence\t<###>" where <###> is the length of '
            'every sequence in the file. Assumes homogeneity and/or padding '
            'of sequences.')

    return num_seqs, len_seqs

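# Usage sketch (illustrative; the path is hypothetical). The returned values
# are numeric and are cast to int downstream (see
# encode_sequences_with_method) before being used to preallocate arrays:
#
#     num_seqs, len_seqs = get_num_and_len_of_seqs_from_file(
#         '/abs/path/to/pTpA_seqs_processed.txt')
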
def check_oligonucleotide_flanks(seq_infile, scaffold_type):
    """
    Checks that all the oligonucleotide sequences in an input file consist
    of the same sequences that flank the variable 80-mer sequence. That
    is, sequences measured in the pTpA scaffold should be of the form:

    TGCATTTTTTTCACATC-(variable region)-GTTACGGCTGTT

    whereas the input sequences measured in the Abf1TATA scaffold will be
    of the form:

    TCACGCAGTATAGTTC-(variable region)-GGTTTATTGTTTATAAAAA

    These flanking sequences are for in-lab sequencing purposes only, so
    they can be discarded when the 80-mer variable sequences are inserted
    into a scaffold sequence.

    Args:
    -----
    seq_infile (str) -- the absolute path of the input file containing all
    of the oligonucleotide sequences to be checked, and their expression
    level values (tab separated).

    scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA) in which
    the expression levels for the sequences in the input file were
    measured.

    Returns:
    -----
    incorrect_lines (list) -- a list of line numbers for sequences that
    contain incorrect flank sequences.
    """
    # Assertions
    assert isinstance(seq_infile, str), ('Absolute pathname must be passed '
                                         'as a string.')
    assert isinstance(scaffold_type, str), ('Scaffold type must be passed '
                                            'as a string.')
    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', (
        'Scaffold type must be specified as either pTpA or Abf1TATA.')
    # Functionality
    if scaffold_type == 'pTpA':
        flank_A = 'TGCATTTTTTTCACATC'
        flank_B = 'GGTTACGGCTGTT'
    elif scaffold_type == 'Abf1TATA':
        flank_A = 'TCACGCAGTATAGTTC'
        flank_B = 'GGTTTATTGTTTATAAAAA'
    infile = smart_open(seq_infile, 'r')
    line_number = 0
    incorrect_lines = []
    for line in infile:
        line_number += 1
        line = check_valid_line(line)
        if line == 'skip_line':
            continue
        seq, exp_level = separate_seq_and_el_data(line)
        if seq.startswith(flank_A) and seq.endswith(flank_B):
            pass
        else:
            incorrect_lines.append(line_number)

    return incorrect_lines

def pull_homogeneous_seqs(input_seqs, scaffold_type=None):
    """
    Pulls all sequences of the modal length (i.e. 110 bp for pTpA-type
    sequences and 115 bp for Abf1TATA-type) from an input file and writes
    them into an output file.

    Args:
    -----
    input_seqs (str) -- the absolute pathname of the input file containing
    all of the raw oligonucleotide sequences and their expression levels,
    tab separated.

    scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA, for which
    the modal length is known to be 110 and 115 respectively) in which the
    expression levels for the sequences in the input file were measured.
    If None, the modal length is calculated from the input file.
    Default: None.

    Returns:
    -----
    absolute_path (str) -- the absolute pathname of the output file
    containing the sequences of modal length.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Input file pathname must be a '
                                         'string.')
    assert os.path.isfile(input_seqs), 'Input file does not exist!'
    assert isinstance(scaffold_type, (str, type(None))), (
        'Scaffold type must be passed as a string (or left as None).')
    if isinstance(scaffold_type, str):
        assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', (
            'Scaffold type must be specified as either pTpA or Abf1TATA, '
            'or else unspecified (in which case it takes the value None).')
    # Functionality
    # Defining the path name of the output file.
    relative_path = 'example/'
    time_stamp = get_time_stamp()
    if scaffold_type is None:
        relative_path += ('other_scaffolds/' + time_stamp +
                          '_homogeneous_seqs.txt')
    else:
        relative_path += (scaffold_type + '_data/' + time_stamp + '_' +
                          scaffold_type + '_homogeneous_seqs.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Open the input and output files.
    infile = smart_open(input_seqs, 'r')
    output_seqs = smart_open(absolute_path, 'w')
    # Retrieve modal length for sequences in input file.
    if scaffold_type == 'pTpA':
        modal_length = 110
    elif scaffold_type == 'Abf1TATA':
        modal_length = 115
    else:
        _, _, modal_length = get_max_min_mode_length_of_seqs(input_seqs)
    # Find seqs in input file w/ modal length and write them to output file
    for line in infile:
        line = check_valid_line(line)
        if line == 'skip_line':
            continue
        seq, exp_level = separate_seq_and_el_data(line)
        if len(seq) == modal_length:
            output_seqs.write(seq + '\t' + str(exp_level) + '\n')
        else:
            continue
    # Close the input and output files.
    infile.close()
    output_seqs.close()

    return absolute_path

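# Usage sketch (illustrative; the input path is hypothetical). When
# scaffold_type is given, the modal length is taken as 110 (pTpA) or
# 115 (Abf1TATA) rather than being recomputed from the file:
#
#     homogeneous_path = pull_homogeneous_seqs(
#         '/abs/path/to/pTpA_raw_seqs.txt', scaffold_type='pTpA')
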
def encode_sequences_with_method(input_seqs, method='One-Hot',
                                 scale_els=True, model_type='1DCNN',
                                 binarized_els=False):
    """
    A wrapper function that encodes all of the sequences in an input file
    according to the specified method, and returns them in a numpy array,
    as well as returning the associated expression levels in a separate
    numpy array.

    Args:
    -----
    input_seqs (str) -- absolute path of the file containing all of the
    input sequences to be encoded, tab-separated with their associated
    expression levels. The first line of the file must be the number of
    sequences in the file, of the format:
    "number_of_seqs_in_file\t<###>" where <###> is the number of
    sequences in the file. The second line in the file must be the length
    to which all sequences are padded, of the format:
    "length_of_each_sequence\t<###>" where <###> is the length of every
    sequence in the file. Assumes homogeneity and/or padding of sequences.

    method (str) -- the method by which the sequence should be encoded.
    Must choose from: 'One-Hot'. Default: 'One-Hot'

    scale_els (bool) -- if True (default), scales all of the expression
    levels in the output list exp_levels to between -1 and 1,
    corresponding to the min and max values respectively.

    model_type (str) -- the type of model being used. Controls the shape
    of the returned array that contains the encoded sequences. Must be
    one of: '1DCNN' (for 1D-convolutional net), '1DLOCCON' (for 1D-locally
    connected net), or 'LSTM' (for Long-Short-Term-Memory net).

    binarized_els (bool) -- if True, the expression levels are cast from
    float to int before being returned (for use with binarized expression
    level data). Default: False.

    Returns:
    -----
    encoded_seqs (numpy.ndarray) -- all of the sequences in the input
    file, encoded with the specified method. The shape of this array
    depends on 'model_type'. For example, for an input file containing
    10000 sequences, each of length 257, where the length of each base
    vector is 5 (corresponding to bases A, T, G, C, N), the output shape
    of encoded_seqs for each model is as follows:

    '1DCNN'    ===> (10000, 257, 5)
    '1DLOCCON' ===> (10000, 257, 5)
    'LSTM'     ===> (10000, 1, 1285)   where 1285 = 257 * 5

    exp_levels (numpy.ndarray) -- all of the expression levels associated
    with the sequences. Each element (i.e. each EL) is of type float.
    Values are scaled to between -1 and 1 if 'scale_els=True'.

    abs_max_el (float) -- the maximum absolute expression level value in
    the input file. Returned as None if 'scale_els=False'.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('TypeError: Input file path must '
                                         'be passed as a string.')
    assert isinstance(method, str), ('TypeError: Specified method must be '
                                     'a string.')
    assert method in METHODS, ('Must specify one of the supported methods '
                               'of encoding the sequence. Choose one of: '
                               '%s' % (METHODS))
    assert isinstance(scale_els, bool), ('scale_els argument must be passed '
                                         'as a bool.')
    assert isinstance(model_type, str), ('model_type argument must be '
                                         'passed as a string.')
    assert model_type in MODELS, ('Must specify model_type as one of the '
                                  'following: %s' % (MODELS))
    # Functionality
    # Open input file
    infile = smart_open(input_seqs, 'r')
    # Initialize output arrays, preallocating dimensions for speed.
    num_seqs, len_seq = organize.get_num_and_len_of_seqs_from_file(input_seqs)
    encoded_seqs = np.zeros((int(num_seqs), int(len_seq), 5)).astype(int)
    exp_levels = np.zeros(int(num_seqs))
    # Encode sequences
    line_number = -3
    for line in infile:
        line_number += 1
        if line_number < 0:
            continue  # skip first 2 lines of the file
        line = check_valid_line(line)
        if line == 'skip_line':
            continue  # skip line if not a valid line
        seq, exp_level = separate_seq_and_el_data(line)
        # Encode with One-Hot method
        if method == 'One-Hot':
            try:
                encoded_seq = one_hot_encode_sequence(seq)
            except Exception:
                raise AssertionError('Error on line %s' % (line_number))
        # Encode with another method, i.e. embedding
        else:
            # Another encoding method will go here
            # encoded_seq = another_encoding_method(seq)
            pass
        # Assign encoded sequences and expression levels to output arrays
        encoded_seqs[line_number] = encoded_seq
        exp_levels[line_number] = exp_level
    # Close the input file
    infile.close()
    # Reshape array if needed as input to LSTM model
    if model_type == 'LSTM':
        encoded_seqs = encoded_seqs.reshape(int(num_seqs), -1)
        encoded_seqs = encoded_seqs.reshape(int(num_seqs), 1,
                                            (int(len_seq) * 5))
    # Scale expression level values to between -1 and 1
    if scale_els:
        abs_max_el = abs(max(exp_levels, key=abs))  # the absolute max value
        # numpy allows easy division of all elements at once
        exp_levels = exp_levels / abs_max_el
    else:  # If no scaling required
        abs_max_el = None
    # If expression levels are binarized, convert them from float ---> int
    if binarized_els:
        exp_levels = exp_levels.astype(int)

    return encoded_seqs, exp_levels, abs_max_el

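# Usage sketch (illustrative; the path is hypothetical). For an LSTM model
# the encoded array is reshaped per sequence to (num_seqs, 1, len_seq * 5):
#
#     encoded_seqs, exp_levels, abs_max_el = encode_sequences_with_method(
#         '/abs/path/to/pTpA_seqs_processed.txt',
#         method='One-Hot', scale_els=True, model_type='1DCNN')
#     encoded_seqs.shape    # e.g. (num_seqs, len_seq, 5)
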
def remove_flanks_from_all_seqs(input_seqs, scaffold_type='pTpA'):
    """
    Removes all of the flanking sequences from an input file of sequences
    and their expression levels (tab separated).

    Example input file:
    GSE104878_20160609_average_promoter_ELs_per_seq_pTpA_ALL.shuffled.txt.gz
    from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104878

    Args:
    -----
    input_seqs (str) -- the absolute pathname of the file containing all
    of the input sequences and their expression levels (tab separated).

    scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA) that the
    input sequences had their expression levels measured in.

    Returns:
    -----
    absolute_path (str) -- the absolute path for the output file
    containing all of the sequences with their flanks removed, along with
    their expression levels (tab separated).
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Input file pathname must be '
                                         'passed as a string.')
    assert os.path.exists(input_seqs), 'Input file does not exist.'
    assert isinstance(scaffold_type, str), ('Scaffold type must be passed '
                                            'as a string.')
    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', (
        'Input scaffold type must be either pTpA or Abf1TATA.')
    # Check that all of the flank sequences are the same in all
    # sequences in the input file.
    incorrect = organize.check_oligonucleotide_flanks(input_seqs,
                                                      scaffold_type)
    assert len(incorrect) == 0, ('Not all sequences in input file have the '
                                 'same flanking sequences. Error on '
                                 'line(s) %s' % str(incorrect))
    # Functionality
    # Defining the pathname for the output file.
    time_stamp = get_time_stamp()  # Get unique time stamp for file naming
    relative_path = ('example/' + scaffold_type + '_data/' + time_stamp +
                     '_' + scaffold_type + '_seqs_flanks_removed.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Opening the input and output files.
    infile = smart_open(input_seqs, 'r')
    outfile = smart_open(absolute_path, 'w')
    # Remove flanks and write data to output file.
    for line in infile:
        line = check_valid_line(line)
        if line == 'skip_line':
            continue
        seq, exp_level = separate_seq_and_el_data(line)
        deflanked_seq = remove_flanks_from_seq(seq, scaffold_type)
        outfile.write(deflanked_seq + '\t' + str(exp_level) + '\n')
    # Close the input and output files.
    infile.close()
    outfile.close()

    return absolute_path

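# Usage sketch (illustrative; the gzipped GEO file path is hypothetical).
# The flank check above must pass for every sequence before any flanks are
# removed:
#
#     deflanked_path = remove_flanks_from_all_seqs(
#         '/abs/path/to/GSE104878_pTpA_ALL.shuffled.txt.gz',
#         scaffold_type='pTpA')
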
def pad_sequences(input_seqs, pad_front=False, extra_padding=0):
    """
    Pads sequences in an input file to the length of the longest sequence
    in the file, plus any extra padding if specified. Pads the sequences
    at either the front or the back with null ('P') characters.

    Args:
    -----
    input_seqs (str) -- the absolute path of the input file containing the
    sequences to be padded and their associated expression levels, tab
    separated.

    pad_front (bool) -- if True, adds padding to the front of the
    sequences. If False (default), pads sequences at the end (i.e. the
    RHS of the sequences).

    extra_padding (int) -- the number of extra null bases to add onto the
    front/back of the sequence. Default: 0.

    Returns:
    -----
    absolute_path (str) -- the absolute path of the output file containing
    all of the padded sequences and their associated expression levels,
    tab separated.
    """
    # Assertions
    assert isinstance(input_seqs, str), ('Pathname of input file must be '
                                         'passed as a string.')
    assert os.path.exists(input_seqs), 'File does not exist.'
    assert isinstance(pad_front, bool), ('The pad_front variable must be '
                                         'passed as a bool.')
    assert isinstance(extra_padding, int), ('The amount of extra padding '
                                            'must be passed as an integer.')
    assert extra_padding >= 0, ('The amount of extra padding must be passed '
                                'as a non-negative integer.')
    # Functionality
    # Define and open the output file
    absolute_path = input_seqs.replace('.txt', '_padded.txt')
    outfile = smart_open(absolute_path, 'w')
    # Retrieve input sequences, pad them, and write them to output file
    max_length, _, _ = organize.get_max_min_mode_length_of_seqs(input_seqs)
    pad_length = max_length + extra_padding
    with smart_open(input_seqs) as f:
        for line in f:
            line = check_valid_line(line)
            if line == 'skip_line':
                continue
            seq, exp_level = separate_seq_and_el_data(line)
            difference = pad_length - len(seq)
            if difference == 0:  # No need for padding
                padded_seq = seq
            else:  # Need to pad
                padding_seq = 'P' * difference
                if pad_front:
                    padded_seq = padding_seq + seq
                else:  # pad the end of the sequence
                    padded_seq = seq + padding_seq
            outfile.write(padded_seq + '\t' + str(exp_level) + '\n')
    # Close the output file
    outfile.close()

    return absolute_path

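# Usage sketch (illustrative; the path is hypothetical). Pads the front of
# every sequence with 10 extra null bases beyond the longest sequence in
# the file:
#
#     padded_path = pad_sequences('/abs/path/to/pTpA_seqs_in_scaffold.txt',
#                                 pad_front=True, extra_padding=10)
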
def insert_all_seq_into_one_scaffold(input_seqs, scaffold_type='pTpA'):
    """
    Takes an input file containing N sequences and inserts them into a
    single scaffold sequence, outputting the N unique promoter sequences
    to an output file along with their expression levels (tab separated).

    Args:
    -----
    input_seqs (str) -- the absolute path for the input file containing
    all the oligonucleotide sequences to be inserted into the single
    scaffold sequence. All sequences must be of the same length as the
    scaffold variable region.

    scaffold_type (str) -- the scaffold type (pTpA or Abf1TATA) that the
    input sequences had their expression levels measured in.
    Default: 'pTpA'.

    Returns:
    -----
    absolute_path (str) -- the absolute path for the output file
    containing all of the complete promoter sequences (where each input
    sequence has been inserted into the scaffold sequence).
    """
    # Assertions
    assert isinstance(input_seqs, str), ('TypeError: pathname for input '
                                         'file must be a string.')
    assert isinstance(scaffold_type, str), ('Scaffold type must be passed '
                                            'as a string.')
    assert scaffold_type == 'pTpA' or scaffold_type == 'Abf1TATA', (
        'Scaffold type must either be passed as "pTpA" or "Abf1TATA".')
    # Functionality
    time_stamp = get_time_stamp()  # get time stamp for unique file naming
    relative_path = ('example/' + scaffold_type + '_data/' + time_stamp +
                     '_' + scaffold_type +
                     '_seqs_inserted_into_scaffold.txt')
    absolute_path = os.path.join(ROOT_DIR, relative_path)
    # Open input and output files
    infile = smart_open(input_seqs, 'r')
    outfile = smart_open(absolute_path, 'w')
    # Retrieve the scaffold sequence
    scaff_directory = 'example/' + scaffold_type + '_data/'
    scaff_rel_path = scaff_directory + scaffold_type + '_scaffold.txt'
    scaff_abs_path = os.path.join(ROOT_DIR, scaff_rel_path)
    scaff_file = smart_open(scaff_abs_path, 'r')
    scaffold = scaff_file.readline().replace('\n', '')
    # Insert sequences into scaffold and write data to output file
    for line in infile:
        line = check_valid_line(line)
        if line == 'skip_line':
            continue
        seq, exp_level = separate_seq_and_el_data(line)
        complete_seq = insert_seq_into_scaffold(seq, scaffold)
        outfile.write(complete_seq + '\t' + str(exp_level) + '\n')
    # Close the input, output, and scaffold files.
    infile.close()
    outfile.close()
    scaff_file.close()

    return absolute_path

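# Pipeline sketch (illustrative only; `raw_path` is a hypothetical input file
# and this ordering is an assumption based on the functions above): remove
# flanks, insert into the scaffold, pad, prepend the file metadata, then
# encode for a model.
#
#     deflanked = remove_flanks_from_all_seqs(raw_path, scaffold_type='pTpA')
#     in_scaffold = insert_all_seq_into_one_scaffold(deflanked, 'pTpA')
#     padded = pad_sequences(in_scaffold)
#     write_num_and_len_of_seqs_to_file(padded)
#     seqs, els, abs_max = encode_sequences_with_method(padded)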