Example #1
def gene_sequence_helper(assembly, geneid):

    db = gffutils.FeatureDB('/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly), keep_order=True)
    gene = db[geneid]
    sf = biopython_integration.to_seqfeature(gene)
    seq_letters = gene.sequence("/fastdata/refseq/{0}.refseq.fa".format(assembly))

    return seq_letters
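
For orientation, a self-contained sketch of the same FeatureDB-to-SeqFeature pattern, using the small GFF3 file bundled with gffutils instead of the project-specific /fastdata paths (no reference FASTA is assumed here, so the sequence-extraction step is omitted):

import gffutils
from gffutils import biopython_integration

# Build an in-memory database from the example GFF3 shipped with gffutils
# and convert one gene record into a Biopython SeqFeature.
db = gffutils.create_db(gffutils.example_filename("gff_example1.gff3"), ":memory:")
gene = db["ENSMUSG00000033845"]  # gene ID taken from the round-trip test below
sf = biopython_integration.to_seqfeature(gene)
print(sf.type, sf.location)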
Example #2
def test_roundtrip():
    """
    Feature -> SeqFeature -> Feature should be invariant.
    """
    db_fname = gffutils.example_filename("gff_example1.gff3")
    db = gffutils.create_db(db_fname, ':memory:')
    feature = db['ENSMUSG00000033845']
    feature.keep_order = True
    dialect = feature.dialect
    s = bp.to_seqfeature(feature)
    assert s.location.start.position == feature.start - 1
    assert s.location.end.position == feature.stop
    f = bp.from_seqfeature(s, dialect=dialect, keep_order=True)
    assert feature == f
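
The two position assertions capture the coordinate conversion performed by to_seqfeature: gffutils keeps GFF-style 1-based, inclusive coordinates, while Biopython locations are 0-based and half-open, so only the start shifts by one. The same arithmetic with made-up numbers:

# A GFF/gffutils feature spanning bases 100..250 (1-based, inclusive)
gff_start, gff_stop = 100, 250
seqfeature_start = gff_start - 1   # 99, zero-based
seqfeature_end = gff_stop          # 250, half-open end is unchanged
# Both conventions describe the same 151 bp span
assert seqfeature_end - seqfeature_start == gff_stop - gff_start + 1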
Example #3
def parse_cds_features(features, record_start):
    cds = []
    gene = []
    for feature in features:
        feature = biopython_integration.to_seqfeature(feature)
        feature.location = FeatureLocation(feature.location.start -
                                           record_start,
                                           feature.location.end - record_start,
                                           strand=feature.location.strand)
        if feature.type == "CDS":
            cds.append(feature)
        else:
            gene.append(feature)
    return cds, gene
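
A self-contained sketch of the offset arithmetic applied inside the loop, with invented coordinates: a CDS at 1500..2000 on a record extracted from (0-based) position 1000 of the parent sequence ends up at 500..1000.

from Bio.SeqFeature import SeqFeature, FeatureLocation

record_start = 1000
feature = SeqFeature(FeatureLocation(1500, 2000, strand=1), type="CDS")
# Shift the location so it is relative to the extracted record, not the parent sequence
feature.location = FeatureLocation(feature.location.start - record_start,
                                   feature.location.end - record_start,
                                   strand=feature.location.strand)
print(feature.location)  # [500:1000](+)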
Example #5
def gene_fasta_helper(assembly, geneid, returntype="filename"):
        
    db = gffutils.FeatureDB('/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly), keep_order=True)
    gene = db[geneid]
    sf = biopython_integration.to_seqfeature(gene)
    seq_letters = gene.sequence("/fastdata/refseq/{0}.refseq.fa".format(assembly))
    
    record  = SeqRecord(Seq(seq_letters,Alphabet.DNAAlphabet()),
                        id=geneid, name=gene.attributes["Name"][0],
                        description="gene region exported by crispr.mit.edu",
                        features=[sf])

    fname = "/fastdata/webserver/tmp/{0}.fa".format(long(random.random()*1000000))
    with open(fname,"w") as f:
        SeqIO.write(record,f,"fasta")

    if returntype=="filename":
        return fname
    elif returntype =="text":
        with open(fname) as fopen:
            return fopen.read()
    else:
        raise Exception("unknown return type {0}".format(returntype))
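
Bio.Alphabet was removed in Biopython 1.78, so on current Biopython the record would be built from a plain Seq. A minimal sketch of the same FASTA export with an invented sequence, gene ID and output path:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Same construction without Bio.Alphabet (removed in Biopython >= 1.78);
# the id, name and sequence here are placeholders, not real database entries.
record = SeqRecord(Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"),
                   id="ENSG00000000001",
                   name="EXAMPLE1",
                   description="gene region exported by crispr.mit.edu")
SeqIO.write(record, "example.fa", "fasta")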
Example #6
def parse_gff(path):
    """Parses a GFF3 file using GFFUtils."""

    # Check for FASTA file
    fasta_path = find_fasta(path)
    if not fasta_path:
        raise FileNotFoundError("Could not find matching FASTA file")

    # Parse FASTA and create GFFUtils database
    fasta = parse_fasta(fasta_path)
    gff = gffutils.create_db(str(path),
                             ":memory:",
                             force=True,
                             merge_strategy="create_unique",
                             sort_attribute_values=True)
    regions = find_regions(gff.directives)

    # Find features for each record in the FASTA file
    loci = []
    for record in fasta:
        try:
            record_start, _ = regions[record.id]
        except KeyError:
            record_start = 1
        features = list(gff.region(seqid=record.id, featuretype="CDS"))
        features.sort(key=lambda f: f.start)
        if not features:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        # Calculate offset based on start of record
        # sequence-region not zero-indexed, so +1
        record_start -= 1

        previous = None
        for feature in features:
            # Check if this feature is part of the previous one for merging
            seqid = feature.attributes["ID"][0]
            same_feature = previous == seqid
            if not previous:
                previous = seqid

            # Normalise Feature location based on ##sequence-region directive.
            # Necessary for extracted GFF3 files that still store coordinates
            # relative to the entire region. If no sequence-region directive
            # is found, assumes 1 (i.e. default sequence start).
            # Note: to_seqfeature automatically zero indexes coordinates, which
            # does not happen by default in GFFUtils, hence no -1 here
            feature = biopython_integration.to_seqfeature(feature)
            feature.location = FeatureLocation(
                feature.location.start - record_start,
                feature.location.end - record_start,
                strand=feature.location.strand)

            # Either merge with previous feature, or append it
            if same_feature:
                if feature.location.strand == 1:
                    record.features[-1].location += feature.location
                else:
                    # Must be in biological order
                    old, new = record.features[-1].location, feature.location
                    record.features[-1].location = new + old
            else:
                record.features.append(feature)
                previous = seqid

        # Try to trace back from the CDS to its parent gene feature for the actual
        # gene coordinates. If none is found (e.g. a malformed GFF without ID= and
        # Parent= attributes), warn the user and default to the CDS start/end.
        genes = []
        for feature in record.features:
            parents = [
                p for p in gff.parents(gff[feature.id], featuretype="gene")
            ]
            start, end = None, None
            if parents:
                start = parents[0].start - record_start - 1
                end = parents[0].end - record_start
            else:
                LOG.warning(f"Could not find parent gene of {feature.id}."
                            " Using coding sequence coordinates instead.")
            gene = Gene.from_seqfeature(feature, record, start=start, end=end)
            genes.append(gene)

        locus = Locus(record.id, genes, 0, len(record))
        loci.append(locus)

    return Cluster(Path(path).stem, loci)
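
The merge branch above relies on Biopython location arithmetic: adding two FeatureLocation objects produces a CompoundLocation, which is how a multi-exon CDS ends up as a single joined feature on the record. A small illustration with invented coordinates:

from Bio.SeqFeature import FeatureLocation

# Two CDS segments on the forward strand
part1 = FeatureLocation(0, 100, strand=1)
part2 = FeatureLocation(150, 300, strand=1)
joined = part1 + part2
print(joined)  # join{[0:100](+), [150:300](+)}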
Example #7
def gene_genbank_spacers_data_helper(data, assembly, geneid, returntype="filename",
                                spacer_sequence_filter=None,
                                tool_filter=None,
                                min_score=90):


    
    db = gffutils.FeatureDB('/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly), keep_order=True)
    gene = db[geneid]
    sf = biopython_integration.to_seqfeature(gene)
    seq_letters = gene.sequence("/fastdata/refseq/{0}.refseq.fa".format(assembly))

    cas9_spacers = data["cas9"]["spacers"]
    cpf1_spacers = data["cpf1"]["spacers"]

    
    sfs = []
    count=0
    for tool,spacer_list in {"cas9":cas9_spacers,
                             "cpf1":cpf1_spacers}.items():
        if tool_filter != None:
            if tool != tool_filter:
                continue
        for s in spacer_list:
            if spacer_sequence_filter:
                if s["guide_sequence"] != spacer_sequence_filter:
                    continue
            if min_score != None:
                
                #print s["score"]
                if s["score"] < min_score:
                    continue

            quals = {}
            if s["pam_before"]:
                quals.update({"upstream_pam":s["pam_before"]})
                
            if s["pam_after"]:
                quals.update({"downstream_pam":s["pam_after"]})

            quals.update({"score":s["score"],
                          "tool":tool,
                          "target_seq":s["guide_sequence"]})
        
                
            sfs.append(SeqFeature(FeatureLocation(s["guide_start"],
                                                  s["guide_start"]+s["guide_length"],
                                                  strand=s["guide_strand"]),
                                  id="guide{0}".format(count),
                                  qualifiers=quals,
                                  type="{0}_guide".format(tool)))
            count+=1

    record  = SeqRecord(Seq(seq_letters,Alphabet.DNAAlphabet()),
                        id=geneid, name=gene.attributes["Name"][0],
                        description="{2} gene {1} exported by crispr.mit.edu, with all spacer sequences scored >{0}".format(min_score,db[geneid].attributes['Name'][0],assembly),
                        features=[sf]+sfs,
                        annotations={"organism":assembly})

   
    
    fname = "/fastdata/webserver/tmp/{0}.gb".format(long(random.random()*1000000))
    with open(fname,"w") as f:
        SeqIO.write(record,f,"genbank")

    
    if returntype=="filename":
        return fname
    elif returntype=="text":
        with open(fname) as fopen:
            return fopen.read()
    else:
        raise Exception("unknown return type {0}".format(returntype))
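
Each qualifiers dict built above becomes the qualifier block of the corresponding feature once the record is written to GenBank. A tiny standalone sketch of one guide feature with invented values:

from Bio.SeqFeature import SeqFeature, FeatureLocation

guide = SeqFeature(FeatureLocation(120, 140, strand=1),
                   id="guide0",
                   type="cas9_guide",
                   qualifiers={"score": 93,
                               "tool": "cas9",
                               "target_seq": "GACGTTAGCCTAGGATCCAA"})
print(guide.type, guide.qualifiers)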
Example #8
def getRandomFusions(db, names, num=5, pStay=0.0):
    # db is the database from module.py
    # names is a vector of the ENSG gene ids (protein coding genes only) from module.py
    # num: number of fusions to simulate
    # pStay: the probability of staying in the same gene pair to generate another fusion isoform. Set to 0.0 to get only one isoform.
    #random.seed(time.time())    #for final code, add parameters for seed

    res = list()  # the list to store the dictionaries for fusions
    # donorTranId, acceptorTranId donorJunction, acceptorJunction

    if len(names) < 2:
        print("Not enough protein coding genes.")
        exit(1)

    total = 0
    tossed = 0
    while total < num:
        # Select genes.
        dId = random.randint(0, len(names) - 1)
        aId = random.randint(0, len(names) - 1)

        # Discard the result if the genes selected are the same.
        if dId == aId:
            tossed = tossed + 1
            if tossed > MAX_TOSS_NUM:
                print("Tossed > " + str(MAX_TOSS_NUM) +
                      " times in generating a pair of genes.")
                exit(1)
            continue

        dGene = db[names[dId]]
        aGene = db[names[aId]]

        # Decide whether to keep the same transcript pair for the next fusion event.
        tossed2 = 0
        keepSame = True
        while keepSame is True:
            keepSame = isStay(pStay)
            # Choose transcripts
            dStrand, dTran = getTranscript(db, dGene)
            aStrand, aTran = getTranscript(db, aGene)
            if (dTran is None) or (aTran is None): continue
            # Choose junctions
            dIsSucess, dExons = getExons(db, dTran)
            aIsSucess, aExons = getExons(db, aTran)
            if dIsSucess and aIsSucess:
                dExonSF = list()
                aExonSF = list()
                for exon in dExons:
                    dExonSF.append(biopython_integration.to_seqfeature(exon))
                for exon in aExons:
                    aExonSF.append(biopython_integration.to_seqfeature(exon))
                if (len(dExonSF) > 0) and (len(aExonSF) > 0):
                    # create fusion event object, and adjust junction positions
                    # to be 0-based
                    fus = FusionEvent(dExonSF, aExonSF, dStrand, aStrand)
                    res.append(fus)
                    total = total + 1
            else:
                tossed2 = tossed2 + 1
                if tossed2 > MAX_TOSS_NUM:
                    print("Tossed > " + str(MAX_TOSS_NUM) +
                          " times in generating fusion junctions.")
                    exit(1)
    return (res)
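
The commented-out random.seed(time.time()) line notes that a seed parameter is still missing. One way to add reproducibility without touching the function body, sketched as a hypothetical wrapper:

import random

def getRandomFusionsSeeded(db, names, num=5, pStay=0.0, seed=None):
    # Hypothetical wrapper: fix the RNG state first so a simulated fusion set
    # can be reproduced, then delegate to the original function above.
    if seed is not None:
        random.seed(seed)
    return getRandomFusions(db, names, num=num, pStay=pStay)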
Example #9
def cluster_from_gff(path, ranges=None):
    """Parses a GFF3 file using GFFUtils."""

    # Check for FASTA file
    fasta_path = find_fasta(path)
    if not fasta_path:
        raise FileNotFoundError("Could not find matching FASTA file")

    # Parse FASTA and create GFFUtils database
    fasta = parse_fasta(fasta_path)
    gff = gffutils.create_db(str(path),
                             ":memory:",
                             force=True,
                             merge_strategy="create_unique",
                             sort_attribute_values=True)
    regions = find_regions(gff.directives)

    # Find features for each record in the FASTA file
    loci = []
    for record in fasta:
        # Check for matching ##sequence-region directive
        try:
            record_start, record_end = regions[record.id]
        except KeyError:
            record_start, record_end = 1, len(record)

        # Check for user-specified range
        if ranges and record.id in ranges:
            record_start, record_end = ranges[record.id]
            LOG.info("    Parsing range %s:%i-%i", record.id, record_start,
                     record_end)

            # Adjust FASTA record to match record_start and record_end
            # -- Default: 0 to end of record
            # -- ##sequence-region: start to end of directive
            # -- User-specified range: start to end of range
            record = record[record_start - 1:record_end]

        # Zero-index the start of the record
        record_start -= 1

        # Extract features from record within range
        region = gff.region(
            seqid=record.id,
            featuretype="CDS",
            start=record_start,
            end=record_end,
            completely_within=True,
        )
        features = sorted(region, key=lambda f: f.start)
        if not features:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        previous = None
        for feature in features:
            # Check if this feature is part of the previous one for merging
            seqid = feature.attributes["ID"][0]
            same_feature = previous == seqid
            if not previous:
                previous = seqid

            # Normalise Feature location based on ##sequence-region directive.
            # Necessary for extracted GFF3 files that still store coordinates
            # relative to the entire region. If no sequence-region directive
            # is found, assumes 1 (i.e. default sequence start).
            # Note: to_seqfeature automatically zero indexes coordinates, which
            # does not happen by default in GFFUtils, hence no -1 here
            feature = biopython_integration.to_seqfeature(feature)
            feature.location = FeatureLocation(
                feature.location.start - record_start,
                feature.location.end - record_start,
                strand=feature.location.strand)

            # Either merge with previous feature, or append it
            if same_feature:
                if feature.location.strand == 1:
                    record.features[-1].location += feature.location
                else:
                    # Must be in biological order
                    old, new = record.features[-1].location, feature.location
                    record.features[-1].location = new + old
            else:
                record.features.append(feature)
                previous = seqid

        # Try to trace back from the CDS to its parent gene feature for the actual
        # gene coordinates. If none is found (e.g. a malformed GFF without ID= and
        # Parent= attributes), warn the user and default to the CDS start/end.
        genes = []
        for feature in record.features:
            parents = [
                p for p in gff.parents(gff[feature.id], featuretype="gene")
            ]
            if parents:
                # e.g. CDS is within range, but gene UTR is not
                parent, *_ = parents
                if parent.start < record_start or parent.end > record_end:
                    continue
                start = parent.start
                end = parent.end
            else:
                LOG.warning(f"Could not find parent gene of {feature.id}."
                            " Using coding sequence coordinates instead.")
                start = feature.location.start + record_start
                end = feature.location.end + record_start
            gene = Gene.from_seqfeature(feature, record, start=start, end=end)
            genes.append(gene)

        locus = Locus(record.id, genes, start=record_start, end=record_end)
        loci.append(locus)

    return Cluster(Path(path).stem, loci)
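
A hypothetical call showing the ranges argument, which overrides any ##sequence-region coordinates for the named record (file and scaffold names are invented):

# Parse only a 20 kb window of one scaffold; all other records fall back to
# their ##sequence-region directives or the full record length as before.
cluster = cluster_from_gff("example.gff3", ranges={"scaffold_1": (5000, 25000)})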
Example #10
def gene_genbank_spacers_helper(assembly, geneid, returntype="filename",
                                spacer_sequence_filter=None,
                                tool_filter=None,
                                min_score=90):


    
    db = gffutils.FeatureDB('/fastdata/zlab-genomes/gffutils/{0}.db'.format(assembly), keep_order=True)
    gene = db[geneid]
    sf = biopython_integration.to_seqfeature(gene)
    seq_letters = gene.sequence("/fastdata/refseq/{0}.refseq.fa".format(assembly))


    
    gene_queries_directory = "/fastdata/crispr/gene_queries"
    query_data_basename = "{assembly}_{geneid}_data.json".format(assembly=assembly,geneid=geneid)
    query_data_file = os.path.join(gene_queries_directory,query_data_basename)
    with open(query_data_file) as fopen:
        status = sjson.loads(fopen.next())
        data = sjson.loads(fopen.next())


    cas9_spacers = data["cas9"]["spacers"]
    cpf1_spacers = data["cpf1"]["spacers"]

    
    sfs = []
    count=0
    for tool,spacer_list in {"cas9":cas9_spacers,
                             "cpf1":cpf1_spacers}.items():
        if tool_filter != None:
            if tool != tool_filter:
                continue
        print tool

        for s in spacer_list:
            if spacer_sequence_filter:            
                if type(spacer_sequence_filter) == str or type(spacer_sequence_filter) == unicode:
                    if s["guide_sequence"] != spacer_sequence_filter:
                        continue
                elif type(spacer_sequence_filter) == list:
                    if not s["guide_sequence"] in spacer_sequence_filter:
                        continue
            if min_score != None:
                
                print s["score"]
                if s["score"] < min_score:
                    continue

            quals = {}
            if s["pam_before"]:
                quals.update({"upstream_pam":s["pam_before"]})
                
            if s["pam_after"]:
                quals.update({"downstream_pam":s["pam_after"]})

            quals.update({"score":s["score"],
                          "tool":tool,
                          "target_seq":s["guide_sequence"]})

            ot_mms = dict([["offtarget_{0}_mms".format(i),ot["mismatches"]] for i,ot in enumerate( s["offtarget_alignments"]) ])
            quals.update(ot_mms)
            
            ot_loci = dict([["offtarget_{0}_locus".format(i),"{0} {1}{2}".format(ot["chrom"],ot["strand"],ot["start"])] for i,ot in enumerate( s["offtarget_alignments"]) ])
            quals.update(ot_loci)
        
                
            sfs.append(SeqFeature(FeatureLocation(s["guide_start"],
                                                  s["guide_start"]+s["guide_length"],
                                                  strand=s["guide_strand"]),
                                  id="guide{0}".format(count),
                                  qualifiers=quals,
                                  type="{0}_guide".format(tool)))
            count+=1

    record  = SeqRecord(Seq(seq_letters,Alphabet.DNAAlphabet()),
                        id=geneid, name=gene.attributes["Name"][0],
                        description="{2} gene {1} exported by crispr.mit.edu, with all spacer sequences scored >{0}".format(min_score,db[geneid].attributes['Name'][0],assembly),
                        features=[sf]+sfs,
                        annotations={"organism":assembly})

   
    
    fname = "/fastdata/webserver/tmp/{0}.gb".format(long(random.random()*1000000))
    with open(fname,"w") as f:
        SeqIO.write(record,f,"genbank")

    
    if returntype=="filename":
        return fname
    elif returntype=="text":
        with open(fname) as fopen:
            return fopen.read()
    else:
        raise Exception("unknown return type {0}".format(returntype))
Example #11
def parse_gff(path):
    """Parses GFF and corresponding FASTA using GFFutils.

    Args:
        path (str):
            Path to GFF file. Should have a corresponding FASTA file of the same
            name with a valid FASTA suffix (.fa, .fasta, .fsa, .fna, .faa).
    Returns:
        list: SeqRecord objects corresponding to each scaffold in the file
    """
    fasta = find_fasta(path)
    if not fasta:
        raise FileNotFoundError(f"Could not find partner FASTA file for {path}")

    # Parse FASTA and create GFFUtils database
    fasta = parse_fasta(fasta)
    gff = gffutils.create_db(
        str(path),
        ":memory:",
        force=True,
        merge_strategy="create_unique",
        sort_attribute_values=True
    )
    regions = find_regions(gff.directives)

    # Find features for each record in the FASTA file
    for record in fasta:
        try:
            record_start, _ = regions[record.id]
            record_start -= 1
        except KeyError:
            record_start = 0

        # Normalise Feature location based on ##sequence-region directive.
        # Necessary for extracted GFF3 files that still store coordinates
        # relative to the entire region, not to the extracted FASTA.
        # If no sequence-region directive is found, assumes 1 (i.e. sequence start).
        cds_features = []
        for feature in gff.region(seqid=record.id, featuretype=["gene", "CDS"]):
            feature = biopython_integration.to_seqfeature(feature)
            feature.location = FeatureLocation(
                feature.location.start - record_start,
                feature.location.end - record_start,
                strand=feature.location.strand
            )
            if feature.type == "CDS":
                cds_features.append(feature)
            else:
                record.features.append(feature)

        if not cds_features:
            raise ValueError(f"Found no CDS features in {record.id} [{path}]")

        # Merge CDS features into singular SeqFeature objects, add them to record
        previous = None
        for feature in sorted(cds_features, key=lambda f: f.location.start):
            seqid = feature.qualifiers["ID"][0]
            same_feature = previous == seqid
            if not previous:
                previous = seqid
            if same_feature:
                if feature.location.strand == 1:
                    record.features[-1].location += feature.location
                else:
                    # Reverse strand locations must be in biological order
                    old, new = record.features[-1].location, feature.location
                    record.features[-1].location = new + old
            else:
                record.features.append(feature)
                previous = seqid

        # Sort, then generate insertion tuples like with other formats
        record.features.sort(key=lambda f: f.location.start)

    return fasta
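
A hypothetical end-to-end use of this parser: the returned records carry the merged CDS SeqFeatures, so they can be inspected or serialised directly (file name invented):

# Each record now holds its gene features plus one merged SeqFeature per CDS.
records = parse_gff("cluster.gff3")
for record in records:
    for feature in record.features:
        print(record.id, feature.type, feature.location)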