def _load_monomer_info(annotations_file, identities_file,
                       target_sequence, alignment_file,
                       use_best_reciprocal, identity_threshold):
    """
    Build the table of most similar hits per species for one monomer,
    optionally restricted to best reciprocal hits.

    Parameters
    ----------
    annotations_file : str
        Path handed to read_species_annotation_table
    identities_file : str
        Path to a csv file that is loaded with pd.read_csv
    target_sequence : str
        Forwarded to find_paralogs (presumably the identifier
        of the target sequence - confirm against find_paralogs)
    alignment_file : str
        Forwarded to filter_best_reciprocal
    use_best_reciprocal : bool
        If true, the per-species hits are additionally passed
        through filter_best_reciprocal
    identity_threshold : float
        Forwarded to find_paralogs

    Returns
    -------
    pd.DataFrame
        Result of most_similar_by_organism, or of
        filter_best_reciprocal if use_best_reciprocal is set
    """
    # species annotation first, then the identity table
    species_annotations = read_species_annotation_table(annotations_file)
    identity_table = pd.read_csv(identities_file)

    # best hit in each organism
    best_hits = most_similar_by_organism(identity_table, species_annotations)

    # nothing more to do unless reciprocal filtering was requested
    if not use_best_reciprocal:
        return best_hits

    paralog_table = find_paralogs(
        target_sequence,
        species_annotations,
        identity_table,
        identity_threshold,
    )

    return filter_best_reciprocal(alignment_file, paralog_table, best_hits)
def describe_concatenation(annotation_file_1, annotation_file_2,
                           genome_location_filename_1,
                           genome_location_filename_2,
                           outfile):
    """
    Describes properties of concatenated alignment.

    Writes a single-row csv (using the pandas to_csv defaults, i.e.
    including the index column) with the following columns

    num_seqs_1 : number of sequences in the first monomer alignment
    num_seqs_2 : number of sequences in the second monomer alignment
    num_nonred_species_1 : number of unique species annotations in the
        first monomer alignment
    num_nonred_species_2 : number of unique species annotations in the
        second monomer alignment
    num_species_overlap: number of unique species found in both alignments
    median_num_per_species_1 : median number of paralogs per species in the
        first monomer alignment
    median_num_per_species_2 : median number of paralogs per species in
        the second monomer alignment
    num_with_embl_cds_1 : number of IDs for which we found an EMBL CDS in the
        first monomer alignment (relevant to distance concatenation only)
    num_with_embl_cds_2 : number of IDs for which we found an EMBL CDS in the
        second monomer alignment (relevant to distance concatenation only)

    Parameters
    ----------
    annotation_file_1 : str
        Path to annotation.csv file for first monomer alignment
    annotation_file_2 : str
        Path to annotation.csv file for second monomer alignment
    genome_location_filename_1 : str or None
        Path to genome location mapping file for first alignment
    genome_location_filename_2 : str or None
        Path to genome location mapping file for second alignment.
        If either genome location file is None, both num_with_embl_cds
        columns are written as NaN.
    outfile: str
        Path to output file
    """
    # load the annotations for each alignment as a pd.DataFrame
    annotations_1 = read_species_annotation_table(annotation_file_1)
    species_1 = annotations_1.species.values

    annotations_2 = read_species_annotation_table(annotation_file_2)
    species_2 = annotations_2.species.values

    # calculate the number of sequences found in each alignment
    num_seqs_1 = len(annotations_1)
    num_seqs_2 = len(annotations_2)

    # calculate the number of species found in each alignment
    # where a species is defined as a unique OS or Tax annotation field
    unique_species_1 = set(species_1)
    unique_species_2 = set(species_2)
    nonredundant_annotations_1 = len(unique_species_1)
    nonredundant_annotations_2 = len(unique_species_2)

    # calculate the number of overlapping species
    n_species_overlap = len(unique_species_1 & unique_species_2)

    # calculate the median number of paralogs per species:
    # count how often each species occurs, then take the median
    n_paralogs_1 = float(np.median(list(Counter(species_1).values())))
    n_paralogs_2 = float(np.median(list(Counter(species_2).values())))

    # If the user provided genome location files, calculate the number
    # of ids for which we found an embl CDS
    if (genome_location_filename_1 is not None
            and genome_location_filename_2 is not None):
        genome_location_table_1 = pd.read_csv(genome_location_filename_1)
        genome_location_table_2 = pd.read_csv(genome_location_filename_2)

        # Number of distinct uniprot IDs with an EMBL CDS; nunique()
        # leaves out missing values so that NA is not counted as an ID
        embl_cds1 = genome_location_table_1.uniprot_ac.nunique()
        embl_cds2 = genome_location_table_2.uniprot_ac.nunique()
    else:
        embl_cds1 = np.nan
        embl_cds2 = np.nan

    concatenation_data = [
        num_seqs_1,
        num_seqs_2,
        nonredundant_annotations_1,
        nonredundant_annotations_2,
        n_species_overlap,
        n_paralogs_1,
        n_paralogs_2,
        embl_cds1,
        embl_cds2,
    ]

    cols = [
        "num_seqs_1",
        "num_seqs_2",
        "num_nonred_species_1",
        "num_nonred_species_2",
        "num_species_overlap",
        "median_num_per_species_1",
        "median_num_per_species_2",
        "num_with_embl_cds_1",
        "num_with_embl_cds_2",
    ]

    # create dataframe and store
    data_df = pd.DataFrame([concatenation_data], columns=cols)
    data_df.to_csv(outfile)