예제 #1
0
    def _load_monomer_info(annotations_file, identities_file, target_sequence,
                           alignment_file, use_best_reciprocal,
                           identity_threshold):
        """
        Assemble the per-organism table of most similar hits for one monomer.

        Parameters
        ----------
        annotations_file
            Passed to read_species_annotation_table to load the
            annotation table
        identities_file
            csv file loaded with pd.read_csv; handed on as the table of
            similarities (presumably sequence identities to the target --
            TODO confirm)
        target_sequence
            Forwarded to find_paralogs; only used when
            use_best_reciprocal is truthy
        alignment_file
            Forwarded to filter_best_reciprocal; only used when
            use_best_reciprocal is truthy
        use_best_reciprocal
            If truthy, the per-organism hits are additionally passed
            through filter_best_reciprocal
        identity_threshold
            Forwarded to find_paralogs; only used when
            use_best_reciprocal is truthy

        Returns
        -------
        The result of most_similar_by_organism (presumably a pd.DataFrame
        holding the best hit in each organism), or the result of
        filter_best_reciprocal applied to it when use_best_reciprocal
        is truthy.
        """
        # load both input tables up front
        annotations = read_species_annotation_table(annotations_file)
        identities = pd.read_csv(identities_file)

        # reduce to the most similar hit for each organism
        best_hits = most_similar_by_organism(identities, annotations)

        # nothing more to do unless reciprocal filtering was requested
        if not use_best_reciprocal:
            return best_hits

        # paralogs are only needed as input to the reciprocal filter
        paralogs = find_paralogs(
            target_sequence, annotations, identities, identity_threshold
        )

        return filter_best_reciprocal(alignment_file, paralogs, best_hits)
예제 #2
0
def describe_concatenation(annotation_file_1, annotation_file_2,
                           genome_location_filename_1, genome_location_filename_2,
                           outfile):
    """
    Summarize two monomer alignments that are to be concatenated.

    A single-row table is written to outfile as csv (together with the
    DataFrame index, as pandas does by default). Its columns are:

    num_seqs_1 : number of rows in the first annotation table
    num_seqs_2 : number of rows in the second annotation table
    num_nonred_species_1 : number of distinct values in the species
        column of the first annotation table
    num_nonred_species_2 : number of distinct values in the species
        column of the second annotation table
    num_species_overlap : number of distinct species values present in
        both annotation tables
    median_num_per_species_1 : median number of rows per species value
        in the first annotation table
    median_num_per_species_2 : median number of rows per species value
        in the second annotation table
    num_with_embl_cds_1 : number of distinct uniprot_ac values in the
        first genome location table (relevant to distance concatenation
        only)
    num_with_embl_cds_2 : number of distinct uniprot_ac values in the
        second genome location table (relevant to distance concatenation
        only)

    Both num_with_embl_cds columns are NaN unless both genome location
    files are given.

    Parameters
    ----------
    annotation_file_1 : str
        Path to annotation.csv file for first monomer alignment
    annotation_file_2 : str
        Path to annotation.csv file for second monomer alignment
    genome_location_filename_1 : str or None
        Path to genome location mapping file for first alignment
    genome_location_filename_2 : str or None
        Path to genome location mapping file for second alignment
    outfile : str
        Path to output file
    """

    def _median_group_size(labels):
        # tally how often each label occurs, then take the median tally
        return float(np.median(list(Counter(labels).values())))

    # annotation tables for both alignments and their species columns
    # (how the species column is derived is up to
    # read_species_annotation_table -- presumably from OS/Tax annotation
    # fields; verify against that helper)
    first_table = read_species_annotation_table(annotation_file_1)
    first_species = first_table.species.values

    second_table = read_species_annotation_table(annotation_file_2)
    second_species = second_table.species.values

    # distinct species per alignment; built once and reused for the
    # per-alignment counts as well as the overlap
    first_unique = set(first_species)
    second_unique = set(second_species)
    shared_species = first_unique.intersection(second_unique)

    # median number of sequences (paralogs) per species
    median_per_species_1 = _median_group_size(first_species)
    median_per_species_2 = _median_group_size(second_species)

    # EMBL CDS statistics need the genome location tables of *both*
    # alignments; if either one is missing, both counts are reported as NaN
    if genome_location_filename_1 is None or genome_location_filename_2 is None:
        cds_count_1 = np.nan
        cds_count_2 = np.nan
    else:
        first_locations = pd.read_csv(genome_location_filename_1)
        second_locations = pd.read_csv(genome_location_filename_2)

        # distinct uniprot_ac values per table; no NA filtering happens
        # here -- NOTE(review): presumably the tables only list IDs with
        # a CDS hit, confirm against the code that writes them
        cds_count_1 = len(set(first_locations.uniprot_ac))
        cds_count_2 = len(set(second_locations.uniprot_ac))

    # keep every column name next to its value so they cannot drift apart
    summary = [
        ("num_seqs_1", len(first_table)),
        ("num_seqs_2", len(second_table)),
        ("num_nonred_species_1", len(first_unique)),
        ("num_nonred_species_2", len(second_unique)),
        ("num_species_overlap", len(shared_species)),
        ("median_num_per_species_1", median_per_species_1),
        ("median_num_per_species_2", median_per_species_2),
        ("num_with_embl_cds_1", cds_count_1),
        ("num_with_embl_cds_2", cds_count_2),
    ]
    header = [column for column, _ in summary]
    row = [value for _, value in summary]

    # single-row table, stored as csv
    pd.DataFrame([row], columns=header).to_csv(outfile)