def __getitem__(self, idx):
    """Return the ``(interval, labels)`` pair for the row at position ``idx``.

    The first ``self.bed_columns`` fields of the row are converted with
    ``to_scalar`` and passed to ``pybedtools.create_interval_from_list``.
    The remaining fields are the labels, cast to ``self.label_dtype``.
    When ``self.ignore_targets`` is truthy or ``self.n_tasks`` is 0, an
    empty dict is returned in place of the labels.
    """
    record = self.df.iloc[idx]

    # Leading columns describe the genomic interval itself.
    bed_fields = list(map(to_scalar, record.iloc[:self.bed_columns]))
    interval = pybedtools.create_interval_from_list(bed_fields)

    # No targets requested (or none present): hand back an empty mapping.
    if self.ignore_targets or self.n_tasks == 0:
        return interval, {}

    # Everything after the BED columns is treated as label values.
    trailing = record.iloc[self.bed_columns:]
    return interval, trailing.values.astype(self.label_dtype)
def __getitem__(self, idx):
    """Assemble the reference and variant-carrying sequences for one entry.

    The entry at position ``idx`` of ``self.bed`` supplies an id, a
    chromosome, a strand and an iterable of exons (read from its ``"pos"``
    field; each exon is indexed as ``exon[0]``/``exon[1]``). For every exon
    the reference sequence is extracted, and if the VCF yields variants for
    that interval a variant-inserted sequence is extracted as well;
    otherwise the reference sequence is reused as the alternative.

    Returns a dict with two keys:
      * ``"inputs"``: ``"ref_seq"`` and ``"alt_seq"``, the per-exon
        sequences concatenated (in reversed exon order when the strand
        equals ``"-"``).
      * ``"metadata"``: the entry id, chromosome and strand as read from the
        entry, the exon positions as ``"start-end"`` items joined by ``";"``,
        and the string forms of all inserted variants joined by ``";"``.
    """
    # The extractors and the VCF handle are created on first use, not before.
    if self.fasta_extractor is None:
        self.fasta_extractor = FastaStringExtractor(
            self.fasta_file,
            use_strand=True,
            force_upper=self.force_upper,
        )
    if self.vcf is None:
        self.vcf = MultiSampleVCF(self.vcf_file)
    if self.vcf_extractor is None:
        self.vcf_extractor = VariantSeqExtractor(self.fasta_file)

    record = self.bed.iloc[idx]
    record_id = record["id"]
    chrom = record["chr"]
    exons = record["pos"]
    strand = record["strand"]

    ref_parts = []
    alt_parts = []
    position_labels = []
    variant_labels = []

    for exon in exons:
        exon_start = exon[0]
        exon_end = exon[1]

        # Interval covering this exon.
        region = pybedtools.Interval(
            to_scalar(chrom),
            to_scalar(exon_start),
            to_scalar(exon_end),
            strand=to_scalar(strand),
        )
        position_labels.append("%s-%s" % (exon_start, exon_end))

        # The reference sequence is recorded for every exon.
        reference = self.fasta_extractor.extract(region)
        ref_parts.append(reference)

        overlapping = list(self.vcf.fetch_variants(region))
        if not overlapping:
            # No variants here: the alternative equals the reference.
            alt_parts.append(reference)
            continue

        # Insert the variants and keep their string forms as metadata.
        mutated = self.vcf_extractor.extract(
            region, variants=overlapping, anchor=0, fixed_len=False)
        alt_parts.append(mutated)
        variant_labels.append(";".join(map(str, overlapping)))

    # Combine: for the "-" strand the exon pieces are joined in reverse order.
    if strand == "-":
        ref_parts = ref_parts[::-1]
        alt_parts = alt_parts[::-1]

    inputs = {
        "ref_seq": "".join(ref_parts),
        "alt_seq": "".join(alt_parts),
    }
    metadata = {
        "id": record_id,
        "chr": chrom,
        "exon_positions": ";".join(position_labels),
        "strand": strand,
        "variants": ";".join(variant_labels),
    }
    return {"inputs": inputs, "metadata": metadata}