Exemplo n.º 1
0
def main():
    """Download archaeal and bacterial genomes and cache their annotated IGRs.

    Queries the Rfam database for every genome whose kingdom is 'archaea' or
    'bacteria' and whose assembly accession is non-empty. Each genome is first
    passed to ``download_genome``; afterwards its inter-genic regions are
    extracted, annotated and pickled to
    ``<pickle_dir>/<assembly_acc>.bz2``. Genomes whose pickle file already
    exists are skipped, so the function can be re-run to resume.
    """
    pickle_dir = "/home/jovyan/work/data/interim/igr_df_pickles/"

    logger = logging.getLogger(__name__)
    logger.info('downloading genomes from NCBI')
    session = rfam_session()
    try:
        # Execute the query exactly once while the session is open. Iterating
        # the lazy query after session.close() would silently re-open a
        # connection that is never released, and would hit the database again
        # for the second loop.
        genomes = session.query(Genome).filter(
            or_(Genome.kingdom == 'archaea',
                Genome.kingdom == 'bacteria')).filter(
                    Genome.assembly_acc != '').all()

        # First pass: fetch every genome before any analysis starts.
        for genome in genomes:
            download_genome(genome)

        # Second pass: build the annotated IGR table for genomes that have
        # not been processed yet.
        for genome in genomes:
            pickle_filename = pickle_dir + genome.assembly_acc + ".bz2"

            if os.path.isfile(pickle_filename):
                continue

            igr_df = extract_igrs(genome)
            annotated_igr_df = annotate_igrs(genome, igr_df)
            annotated_igr_df.to_pickle(pickle_filename)
    finally:
        # Release the database connection even if a download or annotation
        # step raises.
        session.close()
Exemplo n.º 2
0
def display_genome(upid):
    """
    Build the annotated IGR dataframe and the plot objects for a single genome

    Parameters
    ----------
    upid:
        Identifier handed to ``session.query(Genome).get`` to look up the genome
    Returns
    -------
    tuple
        ``(annotated_df, fig, layout, genome)``: the dataframe returned by
        ``annotate_igrs``, the ``go.FigureWidget`` built from it, the layout
        returned by ``graph_layout`` and the looked-up genome object
    """

    # Look the genome up and release the session straight away
    db = rfam_session()
    selected_genome = db.query(Genome).get(upid)
    db.close()

    download_genome(selected_genome)

    # Extract the IGRs (with a length cutoff of 1) and annotate them
    annotated = annotate_igrs(
        selected_genome,
        extract_igrs(selected_genome, igr_length_cutoff=1))

    # Assemble the figure from the traces and the genome-specific layout
    traces = graph_genome(annotated)
    figure_layout = graph_layout(selected_genome)
    widget = go.FigureWidget(data=traces, layout=figure_layout)

    return annotated, widget, figure_layout, selected_genome
Exemplo n.º 3
0
def _build_annotation_trees(session, upid):
    """Return a dict mapping sequence accessions to interval trees of Rfam hits.

    Keys are the ``rfamseq_acc`` values of the genome with the text from the
    first '.' onwards removed; each value is an ``IntervalTree`` holding one
    interval per row of ``t_full_region`` for that sequence.
    """
    # Get the distinct "rfamseq_acc" values recorded for this genome
    rfamseq_acc_rows = session.query(t_genseq.c.rfamseq_acc).filter(
        t_genseq.c.upid == upid).distinct().all()

    annotation_tree_dict = {}
    for rfamseq_acc_row in rfamseq_acc_rows:
        # Each result row holds a single column: the accession itself
        rfamseq_acc = rfamseq_acc_row[0]

        rna_list = session.query(t_full_region).filter(
            t_full_region.c.rfamseq_acc == rfamseq_acc).all()

        # One interval tree per sequence allows rapid overlap searches
        annotation_tree = IntervalTree()
        for rna in rna_list:
            # seq_start may be larger than seq_end, so normalise the
            # coordinates to start <= end before inserting.
            start = min(rna.seq_start, rna.seq_end)
            end = max(rna.seq_start, rna.seq_end)
            annotation_tree.insert_interval(
                Interval(start=start,
                         end=end,
                         chrom=rna.rfamseq_acc,
                         value=rna))

        # Key the tree by the accession without anything after the first '.'
        # so that it matches the 'accession' values used in the IGR dataframe.
        rfamseq_acc_stripped = rfamseq_acc.partition('.')[0]
        annotation_tree_dict[rfamseq_acc_stripped] = annotation_tree

    return annotation_tree_dict


def _collect_igr_overlaps(igr_df, annotation_tree_dict):
    """Return one ``{'igr_index', 'rfam_acc'}`` dict per IGR/annotation overlap."""
    annotated_igr_list = []
    for accession, accession_igr_df in igr_df.groupby('accession'):
        # Lookup the RNA annotation tree for the given accession
        try:
            annotation_tree = annotation_tree_dict[accession]
        except KeyError:
            print("IGR dataframe key: {} not found. Available keys are: {}".
                  format(accession, annotation_tree_dict.keys()))
            # Without a tree for this accession there is nothing to search.
            # Skipping avoids matching these IGRs against the tree of a
            # different sequence left over from an earlier iteration.
            continue

        # For each IGR find all of the overlaps with annotated RNAs
        for igr in accession_igr_df.itertuples():
            for overlap in annotation_tree.find(igr.start, igr.end):
                annotated_igr_list.append({
                    'igr_index': igr[0],
                    'rfam_acc': overlap.value.rfam_acc
                })

    return annotated_igr_list


def annotate_igrs(genome, igr_df):
    """
    Annotate the inter-genic regions listed in a dataframe with any available annotations from Rfam

    IGRs whose accession has no Rfam sequence entry for this genome are
    reported on stdout and left without annotation.

    Parameters
    ----------
    genome: src.data.rfam_db.Genome
        The genome object for the organism whose IGRs are being analyzed
    igr_df: pandas.Dataframe
        The dataframe with the columns 'accession', 'start', 'end', 'length', 'gc'
    Returns
    -------
    annotated_igr_df: pandas.Dataframe
        One row per IGR with the added columns 'rfam_acc', 'rfam_id',
        'description', 'type' (multiple overlapping families are joined into a
        single string), 'category' and 'log_length'
    """

    # Initialize connection to Rfam database
    session = rfam_session()
    try:
        annotation_tree_dict = _build_annotation_trees(session, genome.upid)
        annotated_igr_list = _collect_igr_overlaps(igr_df,
                                                   annotation_tree_dict)

        # Convert annotated_igr_list into dataframe and merge on the igr_index
        annotated_igr_df = pd.merge(igr_df,
                                    pd.DataFrame(annotated_igr_list,
                                                 columns=["igr_index",
                                                          "rfam_acc"]),
                                    on="igr_index",
                                    how='left')

        # Look up the information for all of the RNA families represented in this genome
        rna_family_query = session.query(Family)\
                                  .with_entities(Family.rfam_acc, Family.rfam_id, Family.description, Family.type)\
                                  .filter(Family.rfam_acc.in_(annotated_igr_df["rfam_acc"].dropna().unique()))
        rna_families_df = pd.read_sql(rna_family_query.statement,
                                      rna_family_query.session.bind)
    finally:
        # All database work is done (or has failed); always release the session.
        session.close()

    merged_igr_df = pd.merge(annotated_igr_df,
                             rna_families_df,
                             on="rfam_acc",
                             how="left")

    # Collapse IGRs that overlap several families into one row. The unique
    # values are sorted so the joined strings are identical from run to run.
    combined_descriptions = merged_igr_df.dropna().groupby("igr_index")\
                                                  .agg(dict(rfam_acc=lambda x: ','.join(sorted(set(x))),
                                                            rfam_id=lambda x: ','.join(sorted(set(x))),
                                                            type=lambda x: ','.join(sorted(set(x))),
                                                            description=lambda x: '<br>'.join(sorted(set(x)))))
    merged_igr_df.drop_duplicates(["igr_index"], inplace=True)
    merged_igr_df.reset_index(inplace=True, drop=True)
    # NOTE(review): DataFrame.update aligns on index labels, so this relies on
    # the fresh positional index matching the igr_index values - confirm.
    merged_igr_df.update(combined_descriptions)

    merged_igr_df["category"] = merged_igr_df.apply(categorize_igr, axis=1)

    merged_igr_df["log_length"] = np.log(merged_igr_df["length"])
    return merged_igr_df