Пример #1
0
def get_QP_sample_mask(species_name):
    """Build a boolean mask selecting haploid, high-coverage samples.

    A sample is selected when it appears both in
    ``diversity_utils.calculate_haploid_samples`` and in
    ``diversity_utils.calculate_highcoverage_samples`` for the species.

    Returns a ``(mask, sample_names)`` pair, where ``mask`` is the result of
    ``np.isin`` over ``sample_names`` (as returned by ``get_sample_names``).
    """
    sample_names = get_sample_names(species_name)

    haploid = set(diversity_utils.calculate_haploid_samples(species_name))
    well_covered = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    # Keep only the samples that satisfy both criteria.
    keep = list(haploid.intersection(well_covered))
    mask = np.isin(sample_names, keep)
    return mask, sample_names
def process_one_species(species_name):
    """Plot every high-coverage sample's major-folded SFS, filed by peak count.

    If ``<analysis_directory>/allele_freq/<species_name>`` already exists the
    species is treated as processed: a notice is printed and nothing else
    happens. Otherwise, for each sample returned by
    ``diversity_utils.calculate_highcoverage_samples``, the binned SFS of the
    '4D' sites is plotted and saved to
    ``<analysis_directory>/allele_freq/<species_name>/<num_peaks>/<sample>.png``,
    where ``num_peaks`` is the number of peak indices returned by
    ``HGT_utils._find_sfs_peaks_and_cutoff``.
    """
    species_dir = os.path.join(config.analysis_directory, 'allele_freq',
                               species_name)
    if os.path.exists(species_dir):
        print("{} already processed".format(species_name))
        return

    # Only the per-sample SFS map is used; the returned sample list is not.
    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types={'4D'})
    covered_samples = list(
        diversity_utils.calculate_highcoverage_samples(species_name))

    for sample in covered_samples:
        sample_sfs = sfs_map[sample]

        bin_freqs, bin_probs = sfs_utils.calculate_binned_sfs_from_sfs_map(
            sample_sfs, folding='major')
        bin_width = bin_freqs[1] - bin_freqs[0]

        # For peak finding, only use the polymorphic sites (bins below 0.95).
        polymorphic = bin_freqs < 0.95
        fs = bin_freqs[polymorphic]
        pfs = bin_probs[polymorphic]

        # Upper bound on the peak height: the tallest bin in (0.1, 0.95) or
        # the between-site rate spread over the bins in (0.2, 0.5),
        # whichever is larger.
        _, between_sites, total_sites = \
            sfs_utils.calculate_polymorphism_rates_from_sfs_map(sample_sfs)
        num_mid_bins = ((fs > 0.2) & (fs < 0.5)).sum()
        between_line = between_sites * 1.0 / total_sites / num_mid_bins
        tallest_bin = pfs[(fs > 0.1) & (fs < 0.95)].max()
        pmax = np.max([tallest_bin, between_line])

        peak_idx, cutoff = HGT_utils._find_sfs_peaks_and_cutoff(fs, pfs, pmax)
        num_peaks = len(peak_idx)

        # Draw the full binned SFS and mark the detected peaks on top of it.
        plt.figure()
        ax = plt.gca()
        ax.set_xlim([0.50, 1.00])
        ax.set_ylim([0, pmax*3])
        ax.bar((bin_freqs - bin_width/2), bin_probs, width=bin_width)
        ax.plot(fs[peak_idx] - bin_width/2, pfs[peak_idx], 'rx',
                label='peaks detected')
        ax.set_xlabel('Major allele freq')

        # Shade the region up to the cutoff whenever a truthy cutoff came back.
        if cutoff:
            ax.axvspan(min(fs), cutoff, alpha=0.1, color='red',
                       label='SNP sites')
        ax.legend()

        # Figures are grouped into one sub-directory per peak count.
        path = os.path.join(species_dir, str(num_peaks))
        if not os.path.exists(path):
            os.makedirs(path)
        plt.savefig(path + '/' + sample + '.png')
        plt.close()
Пример #3
0
def get_desired_samples(species_name, between_host=False):
    """Return the set of sample names to analyse for a species.

    With ``between_host=True`` the result is the intersection of the haploid
    and the high-coverage samples reported by ``diversity_utils``.

    Otherwise the candidates are read from the file names found in
    ``<analysis_directory>/allele_freq/<species_name>/1`` (text before the
    first '.', ignoring entries that start with '.') and intersected with the
    high-coverage samples. If that directory does not exist, a message is
    printed and ``None`` is returned.
    """
    well_covered = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    if between_host:
        haploid = set(diversity_utils.calculate_haploid_samples(species_name))
        return haploid & well_covered

    single_peak_dir = os.path.join(config.analysis_directory,
                                   'allele_freq', species_name, '1')
    if not os.path.exists(single_peak_dir):
        print("Please plot sfs by peaks first for {}".format(species_name))
        return None

    # Each file in the directory is named after a sample; skip hidden files.
    single_peak = {
        fname.split('.')[0]
        for fname in os.listdir(single_peak_dir)
        if not fname.startswith('.')
    }
    return single_peak & well_covered
Пример #4
0
def get_single_peak_sample_mask(species_name):
    """
    Compute a mask that keeps only the samples suitable for within-host
    analysis.

    A sample is kept when all of the following hold:
      1) it is in ``diversity_utils.calculate_highcoverage_samples``,
      2) a file named after it exists in
         ``<analysis_directory>/allele_freq/<species_name>/1``,
      3) it is not in ``HGT_utils.get_within_host_bad_samples``,
      4) ``HGT_utils.find_sfs_peaks_and_cutoff`` returns a cutoff that is
         not ``None`` for it.

    Returns a tuple ``(mask, sample_names, cutoffs)``: the boolean mask over
    ``sample_names`` (as returned by ``get_sample_names``), the sample names
    themselves, and a float array holding the cutoff of every kept sample.
    When the single-peak directory is missing, the mask is all False and the
    cutoff array is empty.
    """
    sample_names = get_sample_names(species_name)

    blacklist = set(HGT_utils.get_within_host_bad_samples(species_name))

    highcoverage_samples = set(
        diversity_utils.calculate_highcoverage_samples(species_name))
    single_peak_dir = os.path.join(config.analysis_directory, 'allele_freq',
                                   species_name, '1')
    if not os.path.exists(single_peak_dir):
        print("No single peak samples found for {}".format(species_name))
        mask = np.zeros(len(sample_names)).astype(bool)
        return mask, sample_names, np.array([])

    # Each file in the directory is named after a sample; skip hidden files.
    single_peak_samples = set([
        f.split('.')[0] for f in os.listdir(single_peak_dir)
        if not f.startswith('.')
    ])
    # '-' binds tighter than '&'; the parentheses only make that explicit.
    allowed_samples = single_peak_samples & (highcoverage_samples - blacklist)
    mask = np.isin(sample_names, list(allowed_samples))

    # filter samples with a clean single peak
    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    results = [
        HGT_utils.find_sfs_peaks_and_cutoff(sample, sfs_map)
        for sample in sample_names[mask]
    ]
    cutoffs = np.array([res[1] for res in results])
    # Force a boolean dtype: with no candidate samples the list is empty and
    # numpy would otherwise build a float64 array, which cannot be used as an
    # index below.
    clean_peak_mask = np.array(
        [res[1] is not None for res in results], dtype=bool)
    # Narrow the mask in place: only previously-selected positions can change.
    mask[mask] = clean_peak_mask
    good_cutoffs = cutoffs[clean_peak_mask]
    return mask, sample_names, good_cutoffs.astype(float)
Пример #5
0
    good_species_list = parse_midas_data.parse_good_species_list()
    if species!='all':
        good_species_list = [species]
    else:    
        if debug:
            good_species_list = good_species_list[:3]
    
    # header for the output file.
    record_strs = []
    
    for species_name in good_species_list:

        sys.stderr.write("Loading samples...\n")

        # Only plot samples above a certain depth threshold that are confidently phaseable.
        snp_samples = diversity_utils.calculate_highcoverage_samples(species_name, min_coverage=min_coverage)
        
        if len(snp_samples)<2:
            continue
            
        sys.stderr.write("found %d samples\n" % len(snp_samples))
        
        # Analyze SNPs, looping over chunk sizes. 
        # Clunky, but necessary to limit memory usage on cluster

        # Load SNP information for species_name
        sys.stderr.write("Loading SNPs for %s...\n" % species_name)
        
        snps = []
        snp_map = {} # contig: list of locations map
        
        for j in xrange(i + 1, divergence_matrices[species_name].shape[0]):

            if divergence_matrices[species_name][i, j] >= 0:

                between_divergences[species_name].append(
                    divergence_matrices[species_name][i, j])
    between_divergences[species_name] = numpy.array(
        between_divergences[species_name])

    # Load SNP information for species_name
    sys.stderr.write("Loading SFSs for %s...\t" % species_name)
    sfs_samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name)
    sys.stderr.write("Done!\n")

    highcoverage_samples = diversity_utils.calculate_highcoverage_samples(
        species_name)
    desired_samples = snp_samples

    within_polymorphisms[species_name] = []
    for sample in desired_samples:
        within_sites, between_sites, total_sites = sfs_utils.calculate_polymorphism_rates_from_sfs_map(
            sfs_map[sample])
        within_polymorphisms[species_name].append(within_sites * 1.0 /
                                                  total_sites)

species_names = []
sample_sizes = []
avg_divergences = []

for species_name in species_phylogeny_utils.sort_phylogenetically(
        divergence_matrices.keys()):