def get_repertoire_contents(repertoire, compairr_params):
    """Return the repertoire's sequences as a DataFrame with renamed columns.

    The sequence attribute (named after the current sequence type), the counts
    and, unless ``compairr_params.ignore_genes`` is set, the V/J genes are read
    from ``repertoire``. Rows with missing values in the required columns are
    dropped and a warning reports how many were removed. When
    ``compairr_params.ignore_counts`` is set, missing counts are tolerated and
    every count is overwritten with 1.

    The returned columns are renamed to ``junction_aa`` (whatever the sequence
    type is), ``v_call``, ``j_call``, ``duplicate_count`` and ``repertoire_id``.
    """
    sequence_field = EnvironmentSettings.get_sequence_type().value
    gene_fields = [] if compairr_params.ignore_genes else ["v_genes", "j_genes"]
    count_fields = [] if compairr_params.ignore_counts else ["counts"]

    raw_values = repertoire.get_attributes([sequence_field, "counts", *gene_fields])
    contents = pd.DataFrame({**raw_values, "identifier": repertoire.identifier})

    # Only these columns must be non-missing for a row to be kept.
    required_fields = [sequence_field, *count_fields, *gene_fields]

    size_before = len(contents)
    contents.dropna(subset=required_fields, inplace=True)
    n_removed = size_before - len(contents)

    if n_removed > 0:
        warnings.warn(
            f"CompAIRRHelper: removed {n_removed} entries from repertoire {repertoire.identifier} due to missing values.")

    if compairr_params.ignore_counts:
        contents["counts"] = 1

    column_names = {
        sequence_field: "junction_aa",
        "v_genes": "v_call",
        "j_genes": "j_call",
        "counts": "duplicate_count",
        "identifier": "repertoire_id",
    }
    contents.rename(columns=column_names, inplace=True)

    return contents
def drop_illegal_character_sequences(dataframe: pd.DataFrame, import_illegal_characters: bool) -> pd.DataFrame:
    """Drop rows whose sequence contains characters outside the legal alphabet.

    Args:
        dataframe: table holding a column named after the current sequence
            type's value; illegal rows are dropped from it in place.
        import_illegal_characters: when True, nothing is checked or removed.

    Returns:
        The same ``dataframe`` object, after any in-place row removal.

    A warning is emitted stating how many sequences were removed.
    """
    if import_illegal_characters:
        return dataframe

    sequence_type = EnvironmentSettings.get_sequence_type()
    sequence_name = sequence_type.name.lower().replace("_", " ")

    # Work on a copy: appending the stop codon to the list returned by
    # EnvironmentSettings would otherwise modify that list for every other
    # user if it happens to be shared or cached.
    legal_alphabet = list(EnvironmentSettings.get_sequence_alphabet(sequence_type))

    if sequence_type == SequenceType.AMINO_ACID:
        # Stop codons are accepted as legal in amino acid sequences.
        legal_alphabet.append(Constants.STOP_CODON)

    is_illegal_seq = [
        ImportHelper.is_illegal_sequence(sequence, legal_alphabet)
        for sequence in dataframe[sequence_type.value]
    ]
    n_illegal = sum(is_illegal_seq)

    if n_illegal > 0:
        dataframe.drop(dataframe.loc[is_illegal_seq].index, inplace=True)
        warnings.warn(
            f"{ImportHelper.__name__}: {n_illegal} sequences were removed from the dataset because their {sequence_name} sequence contained illegal characters. "
        )

    return dataframe
def _build_new_sequence(self, sequence: ReceptorSequence, position, signal: dict) -> ReceptorSequence:
    """Create a new ReceptorSequence with the signal's motif instance implanted at ``position``.

    The motif instance may consist of two parts separated by "/"; in that case
    the left part is written at ``position``, the next ``gap`` characters of
    the original sequence are kept, and the right part follows. The characters
    overwritten by the motif parts are removed from the original sequence, so
    the length is unchanged when the motif fits inside it. Metadata is
    deep-copied from the original sequence and the implant is recorded in the
    new sequence's annotation.
    """
    motif_instance = signal["motif_instance"]
    gap_length = motif_instance.gap
    instance_string = motif_instance.instance

    if "/" in instance_string:
        motif_left, motif_right = instance_string.split("/")
    else:
        motif_left, motif_right = instance_string, ""

    original = sequence.get_sequence()
    gap_start = position + len(motif_left)
    gap_end = gap_start + gap_length
    tail_start = gap_end + len(motif_right)

    prefix = original[:position]
    gap_content = original[gap_start:gap_end]
    suffix = original[tail_start:]
    new_sequence_string = prefix + motif_left + gap_content + motif_right + suffix

    annotation = SequenceAnnotation()
    annotation.add_implant(ImplantAnnotation(signal_id=signal["signal_id"],
                                             motif_id=signal["motif_id"],
                                             motif_instance=signal["motif_instance"],
                                             position=position))

    implanted = ReceptorSequence()
    implanted.set_annotation(annotation)
    implanted.set_metadata(copy.deepcopy(sequence.metadata))
    implanted.set_sequence(new_sequence_string, EnvironmentSettings.get_sequence_type())

    return implanted
def get_relevant_sequence_attributes(self):
    """Return the attribute names to use: the sequence field, plus V/J genes unless genes are ignored."""
    sequence_field = EnvironmentSettings.get_sequence_type().value

    if self.compairr_params.ignore_genes:
        return [sequence_field]

    return [sequence_field, "v_genes", "j_genes"]
def __init__(self, use_positional_info: bool, distance_to_seq_middle: int, flatten: bool, name: str = None):
    """Store the encoder settings and precompute the positional weight ramps.

    Args:
        use_positional_info: when True, "start", "mid" and "end" are appended
            to the alphabet to form ``onehot_dimensions``.
        distance_to_seq_middle: length of the positional ramps; may be None or
            0, in which case no ramps are built.
        flatten: stored as-is on the instance.
        name: optional name, stored as-is on the instance.
    """
    self.use_positional_info = use_positional_info
    self.distance_to_seq_middle = distance_to_seq_middle
    self.flatten = flatten
    self.name = name

    if distance_to_seq_middle:
        # Linear ramp 0, 1/d, 2/d, ..., (d-1)/d and its mirror image.
        self.pos_increasing = [1 / distance_to_seq_middle * i for i in range(distance_to_seq_middle)]
        self.pos_decreasing = self.pos_increasing[::-1]
    else:
        # Define both attributes so later reads never raise AttributeError.
        self.pos_increasing = None
        self.pos_decreasing = None

    if EnvironmentSettings.get_sequence_type() == SequenceType.NUCLEOTIDE:  # todo check this / explain in docs
        # Skip the scaling when no distance was given: None * 3 would raise TypeError.
        # NOTE(review): the ramps above are built from the unscaled distance - confirm this is intended.
        if self.distance_to_seq_middle is not None:
            self.distance_to_seq_middle = self.distance_to_seq_middle * 3

    if self.use_positional_info:  # todo test this
        self.onehot_dimensions = self.ALPHABET + ["start", "mid", "end"]
    else:
        self.onehot_dimensions = self.ALPHABET
def _encode_repertoire(self, repertoire, params: EncoderParams):
    """Encode one repertoire's sequences and return ``(encoding, identifier, labels)``.

    The sequences are padded to ``self.max_rep_len`` sequences of length
    ``self.max_seq_len``. ``labels`` is None unless ``params.encode_labels``
    is truthy.
    """
    sequence_field = EnvironmentSettings.get_sequence_type().value
    onehot_encoded = self._encode_sequence_list(
        repertoire.get_attribute(sequence_field),
        pad_n_sequences=self.max_rep_len,
        pad_sequence_len=self.max_seq_len,
    )

    example_id = repertoire.identifier

    if params.encode_labels:
        labels = self._get_repertoire_labels(repertoire, params)
    else:
        labels = None

    return onehot_encoded, example_id, labels
def get_sequence(self, sequence_type: SequenceType = None):
    """Return the stored sequence matching ``sequence_type``.

    When ``sequence_type`` is None, the type preset in EnvironmentSettings is
    used. The amino acid sequence is returned for SequenceType.AMINO_ACID and
    the nucleotide sequence for any other value.
    """
    if sequence_type is None:
        resolved_type = EnvironmentSettings.get_sequence_type()
    else:
        resolved_type = sequence_type

    if resolved_type == SequenceType.AMINO_ACID:
        return self.amino_acid_sequence

    return self.nucleotide_sequence
def get_sequence(self):
    """
    :return: the amino acid sequence when the sequence type preset in EnvironmentSettings is
             SequenceType.AMINO_ACID, otherwise the nucleotide sequence
    """
    use_amino_acids = EnvironmentSettings.get_sequence_type() == SequenceType.AMINO_ACID
    return self.amino_acid_sequence if use_amino_acids else self.nucleotide_sequence
def write_sequence_set_file(self, sequence_set, filename, offset=0):
    """Write ``sequence_set`` to ``filename`` as a tab-separated file with a header row.

    Each entry of ``sequence_set`` is an iterable of strings that is joined
    with tabs; a duplicate count of 1 and a running id (starting at
    ``offset``) are appended as the last two columns. The V/J columns appear
    in the header only when genes are not ignored.
    """
    if EnvironmentSettings.get_sequence_type() == SequenceType.AMINO_ACID:
        sequence_col = "junction_aa"
    else:
        sequence_col = "junction"

    vj_header = "" if self.compairr_params.ignore_genes else "\tv_call\tj_call"
    header = f"{sequence_col}{vj_header}\tduplicate_count\trepertoire_id\n"

    with open(filename, "w") as file:
        file.write(header)
        file.writelines(
            "\t".join(sequence_info) + f"\t1\t{row_id}\n"
            for row_id, sequence_info in enumerate(sequence_set, offset)
        )
def _set_max_dims(self, dataset):
    """Record the largest repertoire size and the longest sequence length in ``dataset``.

    Sets ``self.max_rep_len`` to the maximum number of sequences in any
    repertoire and ``self.max_seq_len`` to the maximum length of any single
    sequence. Both are 0 when the dataset has no repertoires; a repertoire
    without sequences contributes 0 to both instead of raising ValueError.
    """
    sequence_field = EnvironmentSettings.get_sequence_type().value

    max_rep_len = 0
    max_seq_len = 0

    for repertoire in dataset.repertoires:
        sequences = repertoire.get_attribute(sequence_field)
        max_rep_len = max(len(sequences), max_rep_len)
        # default=0 keeps an empty repertoire from making max() raise ValueError.
        longest_here = max((len(seq) for seq in sequences), default=0)
        max_seq_len = max(longest_here, max_seq_len)

    self.max_rep_len = max_rep_len
    self.max_seq_len = max_seq_len