예제 #1
0
    def __getitem__(self, idx):
        """Return the ``(pybedtools.Interval, labels)`` pair for row ``idx``.

        The first ``self.bed_columns`` values of the row are converted with
        ``to_scalar`` and used to build the interval. The remaining values
        are returned as an array cast to ``self.label_dtype``. When
        ``self.ignore_targets`` is set or ``self.n_tasks`` is 0, an empty
        dict is returned in place of the labels.
        """
        record = self.df.iloc[idx]

        # Leading columns describe the interval itself.
        bed_fields = list(map(to_scalar, record.iloc[:self.bed_columns]))
        interval = pybedtools.create_interval_from_list(bed_fields)

        # No targets requested (or none available): return an empty dict.
        if self.ignore_targets or self.n_tasks == 0:
            return interval, {}

        # Everything after the interval columns is treated as label values.
        label_values = record.iloc[self.bed_columns:].values
        return interval, label_values.astype(self.label_dtype)
예제 #2
0
    def __getitem__(self, idx):
        """Assemble reference and variant-bearing sequences for entry ``idx``.

        For every exon listed in the entry's ``"pos"`` field, the reference
        sequence is extracted and the variants overlapping that exon are
        fetched. Exons with variants contribute a variant-inserted sequence
        to ``alt_seq``; exons without variants contribute their reference
        sequence unchanged. The per-exon sequences are joined into a single
        string, in reversed exon order when the entry's strand is ``"-"``.

        Returns:
            dict with two keys:
              - ``"inputs"``: ``ref_seq`` and ``alt_seq`` strings.
              - ``"metadata"``: ``id``, ``chr`` and ``strand`` of the entry,
                plus ``exon_positions`` (``"start-end"`` items joined by
                ``";"``) and ``variants`` (one ``";"``-joined group per exon
                that had variants, the groups themselves joined by ``";"``).
                Both metadata strings keep the order in which the exons are
                listed, regardless of strand.
        """
        # The extractors and the VCF handle are created on first access
        # rather than up front.
        if self.fasta_extractor is None:
            self.fasta_extractor = FastaStringExtractor(
                self.fasta_file, use_strand=True, force_upper=self.force_upper)
        if self.vcf is None:
            self.vcf = MultiSampleVCF(self.vcf_file)
        if self.vcf_extractor is None:
            self.vcf_extractor = VariantSeqExtractor(self.fasta_file)

        record = self.bed.iloc[idx]
        record_id = record["id"]
        chrom = record["chr"]
        exons = record["pos"]
        strand = record["strand"]

        reference_parts = []
        variant_parts = []
        position_labels = []
        variant_labels = []
        for exon in exons:
            exon_start, exon_end = exon[0], exon[1]

            # Genomic interval covering this exon.
            region = pybedtools.Interval(
                to_scalar(chrom),
                to_scalar(exon_start),
                to_scalar(exon_end),
                strand=to_scalar(strand),
            )
            position_labels.append("%s-%s" % (exon_start, exon_end))

            # Reference sequence first, then the variants inside the exon.
            reference = self.fasta_extractor.extract(region)
            overlapping = list(self.vcf.fetch_variants(region))

            # The reference part is recorded whether or not variants exist.
            reference_parts.append(reference)
            if overlapping:
                altered = self.vcf_extractor.extract(
                    region,
                    variants=overlapping,
                    anchor=0,
                    fixed_len=False,
                )
                variant_parts.append(altered)
                # Keep the inserted variants as metadata for this exon.
                variant_labels.append(";".join(str(v) for v in overlapping))
            else:
                # No variants: the alternative sequence equals the reference.
                variant_parts.append(reference)

        # Minus-strand entries are concatenated in reversed exon order
        # (presumably so the joined sequence reads 5'->3' -- confirm).
        if strand == "-":
            reference_parts = reference_parts[::-1]
            variant_parts = variant_parts[::-1]

        inputs = {
            "ref_seq": "".join(reference_parts),
            "alt_seq": "".join(variant_parts),
        }
        metadata = {
            "id": record_id,
            "chr": chrom,
            "exon_positions": ";".join(position_labels),
            "strand": strand,
            "variants": ";".join(variant_labels),
        }
        return {"inputs": inputs, "metadata": metadata}