Example #1
def get_MSA_gene_list(coords, coords_file, method, species_set, version,
                      query_species, MSA_file):
    '''
    Given a dictionary of lists of lists of CDS coordinates, retrieve the Compara MSAs.
    '''
    with open(coords_file, "w") as file:
        for trans in coords:
            for exon in coords[trans]:
                phase = exon[1]
                current_coords = exon[0]
                current_coords = [str(i) for i in current_coords]
                current_coords.append(str(phase))
                current_coords = "|".join(current_coords)
                file.write(current_coords)
                file.write("\n")
    remove_file(MSA_file)
    run_process([
        "perl", "MSA_list.pl", method, species_set, version, coords_file,
        query_species, MSA_file
    ])
    with open(MSA_file) as file:
        string = "".join(file)
    string = re.sub("([a-z])\n([a-z])", "\\1\\2", string)
    with open(MSA_file, "w") as file:
        file.write(string)
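# Illustration (not from the original source): the re.sub call above merges sequence
# lines that were soft-wrapped by the Perl script, collapsing a newline only when it
# sits between two lowercase sequence characters. A minimal, self-contained sketch:
import re

wrapped = "acgta\ncgtac\n>header"
merged = re.sub(r"([a-z])\n([a-z])", r"\1\2", wrapped)
assert merged == "acgtacgtac\n>header"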
Example #2
def get_pairwise_alignment(coords, coords_file, query_species, other_species,
                           version, output_file):
    '''
    Given a list of feature coordinates and two species, get the corresponding pairwise alignments from Compara.
    '''
    #write the coordinates to file in a way that can be read by the downstream perl script
    with open(coords_file, "w") as file:
        for feature in coords:
            feature = [str(i) for i in feature]
            feature = "|".join(feature)
            file.write(feature)
            file.write("\n")
    remove_file(output_file)
    #get the alignments from the database
    run_process([
        "perl", "pairwise_from_ensembl.pl", coords_file, query_species,
        other_species, output_file, version
    ])
    #parse them from the output file produced by the perl script
    with open(output_file) as file:
        string = "".join(file)
    string = string.split("***")
    string = [(i.rstrip("\n")).lstrip("\n") for i in string]
    string = [i.split("|||") for i in string]
    string = [[j for j in i if len(j) > 0] for i in string]
    #getting rid of cases where there are multiple GABs
    string = flatten([i for i in string if len(i) == 1])
    #write alignments to a pretty FASTA
    with open(output_file, "w") as file:
        for feature in string:
            temp = feature.split("\n")
            name = temp[0]
            name = name.split("|")
            antisense = False
            if name[6] == "-":
                antisense = True
            try:
                alignments = [temp[2], temp[3]]
                alignments = [i.split(" ") for i in alignments]
                #convert to upper case
                alignments = [([j for j in i if j][1]).upper()
                              for i in alignments]
                #only keep alignments with no ambiguous bases in either sequence
                if "N" not in alignments[0] and "N" not in alignments[1]:
                    #reverse complement, if necessary
                    if antisense:
                        alignments = [
                            str(
                                Seq(i,
                                    IUPAC.unambiguous_dna).reverse_complement(
                                    )) for i in alignments
                        ]
                    file.write(">{0}\n".format("|".join(name)))
                    file.write("|".join(alignments))
                    file.write("\n")
            except IndexError:
                pass
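# Note (illustration, not from the original source): the IUPAC alphabets used above were
# removed in Biopython 1.78+, where the same reverse complement is written without an
# alphabet argument; alignment gaps ("-") are preserved.
from Bio.Seq import Seq

assert str(Seq("ACGT-AA").reverse_complement()) == "TT-ACGT"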
Example #3
def sort_bed(input_file_name, output_file_name):
    '''
    Sort a bed file.
    '''
    #This is done via a temp file because that way you can specify the same file as input and output file and thus
    #overwrite the unsorted file with the sorted one.
    temp_file_name = "temp_data/temp_sorted_bed{0}.bed".format(random.random())
    hk.run_process(["sort-bed", input_file_name], file_for_output = temp_file_name)
    hk.run_process(["mv", temp_file_name, output_file_name])
    hk.remove_file(temp_file_name)
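# Hypothetical pure-Python equivalent (not part of the original module): sorts a BED
# file that fits in memory without relying on the external "sort-bed" binary from
# BEDOPS, using the same read-then-overwrite idea as above.
import csv

def sort_bed_in_memory(input_file_name, output_file_name):
    with open(input_file_name, newline="") as infile:
        lines = [line for line in csv.reader(infile, delimiter="\t") if line]
    # sort by chromosome name, then by numeric start and end coordinates
    lines.sort(key=lambda line: (line[0], int(line[1]), int(line[2])))
    with open(output_file_name, "w", newline="") as outfile:
        csv.writer(outfile, delimiter="\t").writerows(lines)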
Example #4
def peak_pos_in_exon(exon_starts_file, peaks_file, from_end = False, reads_file = False, reads_mode = False):
    """
    Given a set of exons and a set of peaks, make a dictionary with the peaks overlapping each exon.
    :param exon_starts_file: BED file with the starting regions of those exons that have been chosen for study
    :param peaks_file: BED file of peaks
    :param from_end: if True, distances will be calculated from the ends of exons rather than the starts
    :param reads_mode: if True, assume that the input features are reads and not peaks.
    The difference is that peaks are flat: every position that overlaps a peak counts
    as 1, whereas with reads, a position that overlaps more than one read is counted
    once per read.
    :return: dictionary with junction IDs as keys and a list of all the
    positions that overlap a peak (relative to the start of the exon,
    counting in the direction of transcription). Also return a second dictionary with the centres of the peaks
    (median rounded down to nearest integer).
    """
    # intersect the exon starts and the peaks
    intersect_output = "{0}_{1}".format(exon_starts_file[:-4], peaks_file.split("/")[-1])
    intersect_bed(exon_starts_file, peaks_file, write_both=True, output_file=intersect_output, force_strand=True,
                  no_dups=False, write_zero=True)
    plus = "+"
    if from_end:
        plus = "-"
    out_dict = {}
    with open(intersect_output) as file:
        for line in file:
            line = line.split("\t")
            junction = line[3]
            hk.add_key(junction, [], out_dict)
            # if this exon overlaps with peaks
            if line[6] != ".":
                out_dict[junction].append([])
                peak_start = int(line[7])
                peak_end = int(line[8])
                if line[5] == plus:
                    exon_start = int(line[1])
                    # loop over the nucleotides in the peak
                    for nt in range(peak_start, peak_end):
                        out_dict[junction][-1].append(nt - exon_start)
                else:
                    exon_end = int(line[2])
                    for nt in range(peak_start, peak_end):
                        out_dict[junction][-1].append(exon_end - nt - 1)
    # calculate the centres of the peaks
    out_dict_centres = {i: [int(np.median(j)) for j in out_dict[i]] for i in out_dict}
    # break up separate peaks and just record all the positions once
    if reads_mode:
        out_dict = {i: sorted(list(hk.flatten(out_dict[i]))) for i in out_dict}
    else:
        out_dict = {i: sorted(list(set(hk.flatten(out_dict[i])))) for i in out_dict}
    hk.remove_file(intersect_output)
    return(out_dict, out_dict_centres)
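# Side note (illustration only): the peak "centres" above are np.median values passed
# through int(), which truncates rather than rounds.
import numpy as np

positions = [4, 5, 6, 7]
assert int(np.median(positions)) == 5   # median is 5.5; int() truncates to 5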
Example #5
def get_reads_per_pos(reads_file, transcript_bed):
    """
    Given a BED file of reads and a BED file of transcript coordinates,
    make a dictionary with transcript IDs as keys and number of reads per position,
    as well as the absolute coordinates of the nucleotides, as values.
    :param reads_file: BED file with read coordinates
    :param transcript_bed: BED file with transcript coordinates
    :return: dictionary with numbers of reads per position
    """
    # intersect the transcripts and the reads, so you'd have an output file where
    # the transcript coordinates are followed by the overlapping read
    intermediate_file = "{0}_{1}read_per_pos_intermediate.bed".format(
        reads_file[:-4],
        transcript_bed.split("/")[-1][:-4])
    co.intersect_bed(transcript_bed,
                     reads_file,
                     force_strand=True,
                     write_both=True,
                     no_dups=False,
                     write_zero=False,
                     output_file=intermediate_file)
    reads_per_pos = {}
    total = hk.line_count(intermediate_file)
    print("Calculating the number of reads per position in each transcript...")
    with open(intermediate_file, newline="") as file:
        file_reader = csv.reader(file, delimiter="\t")
        for pos, line in enumerate(file_reader):
            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, total))
            # prefix the chromosome and the strand to the transcript name cause you'll
            # need it later
            trans_name = line[3]
            trans_name = "{0}.{1}.{2}".format(line[0], line[5], trans_name)
            reads_per_pos = hk.add_key(trans_name, {"reads": {}},
                                       reads_per_pos)
            strand = line[5]
            if strand == "+":
                position = int(line[8]) - 1
            else:
                position = int(line[7])
            reads_per_pos[trans_name]["reads"] = hk.add_key(
                position, 0, reads_per_pos[trans_name]["reads"])
            reads_per_pos[trans_name]["reads"][
                position] = reads_per_pos[trans_name]["reads"][position] + 1
            reads_per_pos[trans_name] = hk.add_key(
                "coords", (int(line[1]), int(line[2])),
                reads_per_pos[trans_name])
    hk.remove_file(intermediate_file)
    return reads_per_pos
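# Hypothetical simplification (not part of the original module): the add_key/increment
# pattern above amounts to counting with collections.defaultdict, assuming the same
# column layout as the intersect output (transcript fields 0-5, read fields from 6 on).
from collections import defaultdict

def count_read_ends(intersect_lines):
    counts = defaultdict(lambda: defaultdict(int))
    for line in intersect_lines:
        trans_name = "{0}.{1}.{2}".format(line[0], line[5], line[3])
        # on the + strand the read's 3' end is its last base (end - 1);
        # on the - strand it is the read's start coordinate
        position = int(line[8]) - 1 if line[5] == "+" else int(line[7])
        counts[trans_name][position] += 1
    return counts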
Example #6
def get_pairwise_alignment(coords, coords_file, query_species, other_species, version, output_file):
    '''
    Given a list of feature coordinates and two species, get the corresponding pairwise alignments from Compara.
    '''
    #write the coordinates to file in a way that can be read by the downstream perl script
    with open(coords_file, "w") as file:
        for feature in coords:
            feature = [str(i) for i in feature]
            feature = "|".join(feature)
            file.write(feature)
            file.write("\n")
    remove_file(output_file)
    #get the alignments from the database
    run_process(["perl", "pairwise_from_ensembl.pl", coords_file, query_species, other_species, output_file, version])
    #parse them from the output file produced by the perl script
    with open(output_file) as file:
        string = "".join(file)
    string = string.split("***")
    string = [(i.rstrip("\n")).lstrip("\n") for i in string]
    string = [i.split("|||") for i in string]
    string = [[j for j in i if len(j) > 0] for i in string]
    #getting rid of cases where there are multiple GABs
    string = flatten([i for i in string if len(i) == 1])
    #write alignments to a pretty FASTA
    with open(output_file, "w") as file:
        for feature in string:
            temp = feature.split("\n")
            name = temp[0]
            name = name.split("|")
            antisense = False
            if name[6] == "-":
                antisense = True
            try:
                alignments = [temp[2], temp[3]]
                alignments = [i.split(" ") for i in alignments]
                #convert to upper case
                alignments = [([j for j in i if j][1]).upper() for i in alignments]
                #only keep alignments with no ambiguous bases in either sequence
                if "N" not in alignments[0] and "N" not in alignments[1]:
                    #reverse complement, if necessary
                    if antisense:
                        alignments = [str(Seq(i, IUPAC.unambiguous_dna).reverse_complement()) for i in alignments]
                    file.write(">{0}\n".format("|".join(name)))
                    file.write("|".join(alignments))
                    file.write("\n")
            except IndexError:
                pass
Example #7
def intersect_bed(bed_file1, bed_file2, use_bedops = False, overlap = False, overlap_rec = False, write_both = False, sort = False, output_file = None,
                             force_strand = False, force_opposite_strand = False, no_name_check = False, no_dups = True, chrom = None, intersect = False, hit_count = False, bed_path = None, intersect_bam=None,
                  write_zero = False, write_bed = False, exclude = False):
    '''Use bedtools/bedops to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    OPTIONS
    output_file: write output to this file
    use_bedops: use bedops rather than bedtools. Certain options are only valid with one of the two, see below.
    overlap: minimum overlap required as a fraction of the intervals in bed file 1 (EX: 0.8 means that the
    overlap has to cover at least 80% of each interval in bed file 1).
    overlap_rec: require that the overlap as a fraction of the intervals in file 2 be at least as high as
    the threshold indicated in -f.
    write_both: if True, return not only the interval from bed file 1 but, tagged onto the end, also the
    interval from bed file 2 that it overlaps (only
    valid when using bedtools).
    exclude: if True, report intervals that DON'T overlap
    sort: sort bed files before taking the intersection
    force_strand: check that the feature and the bed interval are on the same strand (only valid with bedtools)
    force_opposite_strand: if True, check that the feature and the interval are on OPPOSITE strands
    no_name_check: if set to False, checks whether the chromosome names are the same in the two bed files (only valid with bedtools)
    no_dups: if True, only returns each interval once. If set to false, intervals in bed file 1 that overlap several intervals in
    bed file 2 will be returned several times (as many times as there are overlaps with different elements in bed file 2)
    chrom: limit search to a specific chromosome (only valid with bedops, can help in terms of efficiency)
    intersect: rather than returning the entire interval, only return the part of the interval that overlaps an interval in bed file 2.
    hit_count: for each element in bed file 1, return the number of elements it overlaps in bed file 2 (only valid with bedtools)
    intersect_bam: intersect a bam file with a bed file. Requires bam file to be called first
    write_zero: like write_both but also write A intervals that don't overlap with any B intervals.
    write_bed: when intersecting a bam file, write output as bed.'''
    if force_strand and force_opposite_strand:
        raise Exception("force_strand and force_opposite_strand can't both be True")
    hk.make_dir("temp_data/")
    temp_file_name = "temp_data/temp_bed_file{0}.bed".format(random.random())
    #have it write the output to a temporary file
    if use_bedops:
        bedtools_output = run_bedops(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, no_dups = no_dups, intersect_bam = intersect_bam, overlap_rec = overlap_rec)
    else:
        bedtools_output = run_bedtools(bed_file1, bed_file2, force_strand, force_opposite_strand, write_both, chrom, overlap, sort, no_name_check, no_dups, output_file = temp_file_name, intersect = intersect, hit_number = hit_count, bed_path = bed_path, intersect_bam = intersect_bam, write_zero = write_zero, overlap_rec = overlap_rec, write_bed = write_bed, exclude = exclude)
    #move it to a permanent location only if you want to keep it
    if output_file:
        hk.run_process(["mv", temp_file_name, output_file])
    else:
        bedtools_output = rw.read_many_fields(temp_file_name, "\t")
    hk.remove_file(temp_file_name)
    return(bedtools_output)
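# Hypothetical usage (assumes BEDTools is on the PATH and that the input files exist):
# keep the reads that overlap peaks on the same strand, appending the matched peak
# interval to each output line, as in the calls made elsewhere in these examples.
intersect_bed("reads.bed", "peaks.bed", write_both=True, force_strand=True,
              no_dups=False, output_file="reads_in_peaks.bed")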
Example #8
def get_sim_p_core(simulations, hits, controls, fasta, correspondances,
                   alignments, method, statistic, reverse_site_numbers,
                   degen_hits_file, degen_controls_file):
    '''
    Core function for get_sim_p.
    '''
    sim_norm_ds = []
    counter = 0
    for sim in simulations:
        counter = update_counter(counter, 10)

        #shuffle hits and controls
        if not reverse_site_numbers:
            temp_hits, temp_controls = shuffle_dictionaries(hits, controls)
        else:
            temp_controls, temp_hits = shuffle_dictionaries(hits, controls)

        hit_phylip = "temp_data/temp{0}.phy".format(random.random())
        control_phylip = "temp_data/temp{0}.phy".format(random.random())

        #write phylip alignments with the pseudo-hit and pseudo-control positions
        conservation.write_hits_to_phylip(fasta, temp_hits, hit_phylip,
                                          correspondances, alignments,
                                          degen_hits_file)
        conservation.write_hits_to_phylip(fasta, temp_controls, control_phylip,
                                          correspondances, alignments,
                                          degen_controls_file)

        #get PAML estimates
        hit_ds = conservation.run_codeml(hit_phylip,
                                         "temp_data/temp_{0}.phy".format(
                                             random.random()),
                                         method=method)[statistic]
        control_ds = conservation.run_codeml(control_phylip,
                                             "temp_data/temp_{0}.phy".format(
                                                 random.random()),
                                             method=method)[statistic]
        sim_norm_ds.append((hit_ds - control_ds) / control_ds)

        remove_file(hit_phylip)
        remove_file(control_phylip)
    return (sim_norm_ds)
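# Illustration only (not taken from get_sim_p itself): simulated statistics like
# sim_norm_ds are typically turned into an empirical p-value by counting how many
# simulations are at least as extreme as the observed value.
def empirical_p(observed, simulated):
    as_extreme = len([s for s in simulated if s <= observed])
    return (as_extreme + 1) / (len(simulated) + 1)

assert empirical_p(-0.2, [-0.3, -0.1, 0.0, 0.1]) == 0.4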
Example #9
def get_MSA_gene_list(coords, coords_file, method, species_set, version, query_species, MSA_file):
    '''
    Given a dictionary of lists of lists of CDS coordinates, retrieve the Compara MSAs.
    '''
    with open(coords_file, "w") as file:
        for trans in coords:
            for exon in coords[trans]:
                phase = exon[1]
                current_coords = exon[0]
                current_coords = [str(i) for i in current_coords]
                current_coords.append(str(phase))
                current_coords = "|".join(current_coords)
                file.write(current_coords)
                file.write("\n")
    remove_file(MSA_file)
    run_process(["perl", "MSA_list.pl", method, species_set, version, coords_file, query_species, MSA_file])
    with open(MSA_file) as file:
        string = "".join(file)
    string = re.sub("([a-z])\n([a-z])", "\\1\\2", string)
    with open(MSA_file, "w") as file:
        file.write(string)
Example #10
def get_lambda(lambda_file_outroot, phy_file, subst_model, min_inf = None):
    '''
    Calculate lambda input parameter for INSIGHT.
    '''
    lambda_file = "{0}.mod".format(lambda_file_outroot)
    #to make sure you catch it if the phyloFit process fails
    remove_file(lambda_file)
    #from UCSC
    tree_file = "DFE/UCSC_model.mod"
    #subst_model is JC69, for instance
    #scale-only, cause you don't want it to estimate a new tree, just to scale the whole thing
    arguments = ["phyloFit", "--init-model", tree_file, "--out-root", lambda_file_outroot, "--subst-mod", subst_model,
                           "--msa-format", "PHYLIP", "--scale-only", phy_file]
    #must be set to False for testing
    if min_inf:
        arguments.extend(["-I", min_inf])
    results = run_process(arguments)
    with open(lambda_file) as file:
        lambda_b = file.read()
    lambda_b = re.findall(lambda_regex, lambda_b)[0]
    return(lambda_b)
Example #11
def main():
    description = "Run mDFEest with shuffled input to check the false positive rate."
    args = parse_arguments(description, [
        "hits_file", "controls_file", "output_file", "n_sim", "SNP_file",
        "SNP_number", "hit_reduce", "control_reduce", "const_pop"
    ],
                           ints=[3, 5],
                           floats=[6, 7],
                           flags=[8])
    hits_file, controls_file, output_file, n_sim, SNP_file, SNP_number, hit_reduce, control_reduce, const_pop = args.hits_file, args.controls_file, args.output_file, args.n_sim, args.SNP_file, args.SNP_number, args.hit_reduce, args.control_reduce, args.const_pop

    with open(output_file, "w") as file:
        for sim in range(n_sim):
            print(sim)

            temp_hits_file = "temp_data/hits_file{0}.txt".format(
                random.random())
            temp_controls_file = "temp_data/controls_file{0}.txt".format(
                random.random())
            temp_input_file = "temp_data/input_file{0}.txt".format(
                random.random())

            #shuffle hits and controls for negative control
            run_process([
                "python3", "shuffle_hits_and_controls.py", hits_file,
                controls_file, temp_hits_file, temp_controls_file, hit_reduce,
                control_reduce
            ])

            #generate multiDFEest input file
            run_process([
                "python3", "mDFEest_input.py", temp_hits_file,
                temp_controls_file, SNP_file, SNP_number, temp_input_file
            ])

            output = mDFEest("beta", temp_input_file, pop_change=True)

            print(output)
            print(output["Nes_0.0_0.1"])
            print(output["Nes_0.1_1.0"])

            file.write("{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                              output["Nes_0.1_1.0"]))

            #if you also want to run with fixed population size
            if const_pop:
                output = mDFEest("beta", temp_input_file, pop_change=False)

                file.write("\t{0}\t{1}\t{2}".format(sim, output["Nes_0.0_0.1"],
                                                    output["Nes_0.1_1.0"]))

            file.write("\n")

            remove_file(temp_hits_file)
            remove_file(temp_controls_file)
            remove_file(temp_input_file)
Example #12
def get_ancestral_CG(outroot, subst_model, phy_files, model_file, tuples_mapping_dict, anc_CG_file_name, high_CG = None, min_inf = None, macaque = False, comprehensive = False, from_model = False):
    '''
    Get a dictionary that says for each transcript which positions were ancestrally CpG/GpC.
    '''
    #if a file name hasn't been supplied or if the file with the supplied name doesn't exist, determine
    #CpG positions again, otherwise just read them in from the file
    if not anc_CG_file_name or anc_CG_file_name == "None" or not os.path.exists(anc_CG_file_name):
        #you need several in case you have a high_CG dictionary
        pps = []
        for phy_file in phy_files:
            if subst_model == "JC69" or from_model:
                #use an existing substitution model
                arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model,
                                       "--msa-format", "PHYLIP", "--post-probs", "--scale-only", phy_file]
            else:
                #estimate a new model
                arguments = ["phyloFit", "--out-root", outroot, "--subst-mod", subst_model,
                                       "--msa-format", "PHYLIP", "--tree", "DFE/full_tree.tree", "--post-probs", phy_file]
                
            if subst_model == "JC69":
                block_size = 4
                tuple_pos_lim = 2
                shift_in_tuple = 0
            else:
                #for dinucleotide models
                block_size = 16
                tuple_pos_lim = 3
                shift_in_tuple = 9

            #turn off when testing                        
            if min_inf:
                arguments.extend(["-I", min_inf])
            results = run_process(arguments)
            #read in posterior probabilities of having various nucleotides ancestrally
            pp_file = "{0}.postprob".format(outroot)
            pp = rw.read_many_fields(pp_file, " ")
            pp = [[j for j in i if j] for i in pp]
            pp = pp[2:]
            #the posterior probability that you had a CpG at a position has to be greater
            #than threshold for a position to be counted as ancestrally CpG
            threshold = 0.5
            #will be over-written if you're doing big tree
            human_pos = 0
            #the outgroup nodes are labelled from the outside in, starting from 1
            if macaque:
                #it's to know whether we're doing big tree or little tree
                if len(pp[0]) == 14:
                    #little tree, mononucleotide
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (3 * block_size): len(i) - (2 * block_size)]] for i in pp}
                elif len(pp[0]) > 14:
                    #big tree/dinucleotide (i.e. it'll give you nonsense if you're trying to do context with the little tree)
                    #the shift_in_tuple is to do with the fact that if you're doing U2S, you want the second tuple and not the first
                    human_pos = 3 + shift_in_tuple
                    if comprehensive:
                        #you want to get all nodes except for node 0, which is the outgroup-ingroup ancestor
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (j * block_size): len(i) - ((j - 1) * block_size)] for j in range(1, 7)] for i in pp}
                    else:
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (6 * block_size): len(i) - (5 * block_size)]] for i in pp}
                else:
                    #for tests etc. where you might only have, say, two species
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            else:
                pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            pps.append(pp)
        anc_CG = {}
        #just to get the length
        example_pp = pps[0][list(pps[0].keys())[0]]
        for trans in tuples_mapping_dict:
            #tuples_mapping_dict has the alignment tuple corresponding to each position
            #because the phyloFit output is organized by tuples, not by positions
            anc_CG[trans] = []
            for node_pos in range(len(example_pp)):
                #if you're using dinucleotides
                if subst_model != "JC69":
                    for pos in sorted(tuples_mapping_dict[trans].keys())[1:]:
                        try:
                            pp_number = 0
                            #if you're gonna produce different output dictionaries for high and low GC regions
                            if high_CG:
                                if pos in high_CG[trans]:
                                    pp_number = 1
                            current_tuple = tuples_mapping_dict[trans][pos]
                            #don't consider positions where there is an alignment gap for human
                            if current_tuple[human_pos] != "*":
##                                print(current_tuple)
##                                print(pps[pp_number])
##                                print("\n")
                                if current_tuple in pps[pp_number]:
                                    current_pp = pps[pp_number][current_tuple][node_pos]
                                else:
                                    current_pp = pps[abs(pp_number - 1)][current_tuple][node_pos]
                                #because it can be either GC or CG, hence 6 or 9
                                if float(current_pp[6]) > threshold or float(current_pp[9]) > threshold:
                                    #you're always testing the second member in the dinucleotide
                                    anc_CG[trans].append(pos - 1)
                                    anc_CG[trans].append(pos)
                        except KeyError:
                            if pos % 100 == 0:
                                pass
                            else:
                                raise KeyError
                else:
                    #if you're using mononucleotides, you have to keep track of what the previous nucleotide was
                    C_prev = False
                    G_prev = False
                    for pos in sorted(tuples_mapping_dict[trans].keys()):
                        pp_number = 0
                        if high_CG:
                            if pos in high_CG[trans]:
                                pp_number = 1
                        current_C = False
                        current_G = False
                        current_tuple = tuples_mapping_dict[trans][pos]
                        if current_tuple[human_pos] != "*":
                            current_pp = pps[pp_number][current_tuple][node_pos]
                            #if current is C and previous was G
                            if float(current_pp[1]) > threshold:
                                if G_prev:
                                    anc_CG[trans].append(G_pos)
                                    anc_CG[trans].append(pos)
                                current_C = True
                            #if current is G and previous was C
                            if float(current_pp[2]) > threshold:
                                if C_prev:
                                    anc_CG[trans].append(C_pos)
                                    anc_CG[trans].append(pos)
                                current_G = True
                            C_prev = False
                            G_prev = False
                            if current_C:
                                C_prev = True
                                #you need to specify the position explicitly because it's not necessarily
                                #the last one if there were dashes
                                C_pos = pos
                            if current_G:
                                G_prev = True
                                G_pos = pos
            anc_CG[trans] = sorted(list(set(anc_CG[trans])))
        remove_file(pp_file)
        if anc_CG_file_name and anc_CG_file_name != "None":
            with open(anc_CG_file_name, "w") as file:
                for trans in anc_CG:
                    to_write = "\t".join([trans, ",".join([str(i) for i in anc_CG[trans]])])
                    file.write(to_write)
                    file.write("\n")
    else:
        #parse
        anc_CG = rw.read_many_fields(anc_CG_file_name, "\t")
        anc_CG = [i for i in anc_CG if len(i) == 2]
        anc_CG = list_to_dict(anc_CG, 0, 1)
        anc_CG = {i: [int(i) for i in anc_CG[i].split(",") if i != ""] for i in anc_CG}
    return(anc_CG)
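# Illustration only: the dictionary comprehensions above pull fixed-width blocks of
# posterior probabilities off the end of each phyloFit row (block_size values per node,
# with nodes counted from the end of the row).
row = list(range(20))
block_size = 4
j = 2   # second block counted from the end of the row
block = row[len(row) - j * block_size: len(row) - (j - 1) * block_size]
assert block == [12, 13, 14, 15]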
Example #13
def get_new_method_results(hit_file,
                           control_file,
                           hit_phylip,
                           control_phylip,
                           correspondances,
                           alignments,
                           fasta,
                           baseml=False,
                           return_CpG=False,
                           global_fasta=None,
                           return_overall=False,
                           motifs=None,
                           fs=None,
                           regions=False):
    '''
    Calculate normalized dS.
    '''
    #if you're meant to ignore degenerate substitutions,
    #the degeneracy file will have been supplied as the hit file
    #and the real hit file name can be derived from the name of the
    #degeneracy file
    if "_degen.txt" in hit_file:
        degen_hits_file = hit_file
        degen_controls_file = control_file
        hit_file = hit_file[:-10]
        control_file = control_file[:-10]
    else:
        degen_hits_file = None
        degen_controls_file = None
    #read in hit and control positions
    hits = parse_basinhoppin_pos(hit_file)
    controls = parse_basinhoppin_pos(control_file)

    try:
        #write control and hit sequences to PHYLIP files
        conservation.write_hits_to_phylip(fasta,
                                          hits,
                                          hit_phylip,
                                          correspondances,
                                          alignments,
                                          degen_hits_file,
                                          baseml=baseml,
                                          fs=fs,
                                          regions=regions,
                                          global_fasta=global_fasta)
        conservation.write_hits_to_phylip(fasta,
                                          controls,
                                          control_phylip,
                                          correspondances,
                                          alignments,
                                          degen_controls_file,
                                          baseml=baseml,
                                          fs=fs,
                                          regions=regions,
                                          global_fasta=global_fasta)

        #if you're doing nucleotide-based rather than codon-based
        if baseml:
            method = "baseml"
            statistic = "tree length"
        else:
            method = "gy"
            statistic = "dS"

        #if you want to return the density * normalized dS statistic, you need the density
        if return_overall:
            density = nc.get_sequence_set_density(fasta,
                                                  None,
                                                  motifs,
                                                  None,
                                                  False,
                                                  "temp_data/temp_dens1.txt",
                                                  "temp_data/temp_dens2.txt",
                                                  "temp_data/temp_pos.txt",
                                                  None,
                                                  feature_set=fs,
                                                  concat=True,
                                                  positions=False)["density"]
            print("Density: {0}.".format(density))

        #get dS estimates from PAML
        hit_ds = conservation.run_codeml(hit_phylip,
                                         "temp_data/temp_{0}.phy".format(
                                             random.random()),
                                         method=method)[statistic]
        control_ds = conservation.run_codeml(control_phylip,
                                             "temp_data/temp_{0}.phy".format(
                                                 random.random()),
                                             method=method)[statistic]

        remove_file(control_phylip)

        #report CpG frequency in hits vs controls
        hit_freq, control_freq = CpG_frequency(fasta, hits, controls)
        print("Hit dS: {0}.".format(hit_ds))
        print("Control dS: {0}.".format(control_ds))
        norm_ds = (hit_ds - control_ds) / control_ds
        print("Normalized dS: {0}.\n".format(norm_ds))

        if return_overall:
            overall = norm_ds * density
            print("Overall decrease: {0}.\n".format(overall))
            return (norm_ds, density, overall)

        if return_CpG:
            return (norm_ds, hit_freq, control_freq)
        return ((hit_ds - control_ds) / control_ds)
    except conservation.NoDataException:
        print("No input sequence available.")
        if return_CpG:
            return (None, None, None)
        return (None)
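# Worked example (illustration only) of the normalized dS statistic returned above:
hit_ds, control_ds = 0.12, 0.16
norm_ds = (hit_ds - control_ds) / control_ds
assert round(norm_ds, 2) == -0.25   # hits evolve 25% more slowly than controls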
Example #14
def main():
    description = "Directly compare the frequency of segregating sites/mean allele frequency between hits and controls."
    args = parse_arguments(description, [
        "hit_file", "control_file", "INSIGHT_hit_file", "INSIGHT_control_file",
        "SFS_file", "trial_file", "trials", "shuffle"
    ],
                           ints=[6],
                           flags=[7])
    hit_file, control_file, INSIGHT_hit_file, INSIGHT_control_file, SFS_file, trial_file, trials, shuffle = args.hit_file, args.control_file, args.INSIGHT_hit_file, args.INSIGHT_control_file, args.SFS_file, args.trial_file, args.trials, args.shuffle

    true_hits = rw.read_pos(hit_file)
    true_controls = rw.read_pos(control_file)

    #to store the original data in case this is a negative control and you will be shuffling
    #hits and controls
    original_INSIGHT_hit_file = INSIGHT_hit_file
    original_INSIGHT_control_file = INSIGHT_control_file

    print(hit_file)

    with open(trial_file, "w") as file:
        file.write(
            "trial\tpoly_fraction_hits - poly_fraction_controls\tmedian_hit_MAF - median_control_MAF\n"
        )
        for trial in range(trials):
            to_write = "{0}\t".format(trial)

            #if this is a negative control
            if shuffle:
                INSIGHT_hit_file = re.sub("_0_", "_{0}_".format(trial),
                                          original_INSIGHT_hit_file)
                INSIGHT_control_file = re.sub("_0_", "_{0}_".format(trial),
                                              original_INSIGHT_control_file)
                temp_hits_file = "temp_data/temp_hits{0}.txt".format(
                    random.random())
                temp_controls_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                #shuffle hits and controls
                temp_hits, temp_controls = shuffle_dictionaries(
                    true_hits, true_controls)
                rw.write_pos(temp_hits, temp_hits_file)
                rw.write_pos(temp_controls, temp_controls_file)
                SFS_file = "temp_data/temp_SFS_file{0}.txt".format(
                    random.random())
                #generate an INSIGHT input file that you could then use for the manual analysis
                run_process([
                    "python3", "mDFEest_input.py", temp_hits_file,
                    temp_controls_file,
                    "general/1000genomes/filtered_hg38_85_pc_multiexon_Yoruban_SNPs_relative.txt",
                    216, SFS_file
                ])
                remove_file(temp_hits_file)
                remove_file(temp_controls_file)

            hit_data = get_data(INSIGHT_hit_file)
            control_data = get_data(INSIGHT_control_file)

            poly_ratio_diff = get_chisq_site_freq(hit_data, control_data)
            to_write = to_write + "{0}\t".format(poly_ratio_diff)

            temp, median_diff = get_mean_freq(SFS_file)
            to_write = to_write + "{0}\n".format(median_diff)

            if shuffle:
                remove_file(SFS_file)

            file.write(to_write)
Example #15
def main():
    description = "Calculate the normalized dS of a dataset."
    args = parse_arguments(description, [
        "dataset", "feature_set", "genome", "families_file", "fasta",
        "hit_file_prefix", "motifs_file", "correspondances", "alignments",
        "suffix", "trials", "trial_file", "old_trial_file", "region_fasta",
        "old_motif_format", "nonsense", "no_families", "newest_only",
        "top_set_only", "calc_p", "reverse_site_numbers", "matched", "degen",
        "regions"
    ],
                           ints=[10],
                           flags=[14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
    dataset, feature_set, genome, families_file, fasta, hit_file_prefix, motifs_file, correspondances, alignments, suffix, trials, trial_file, old_trial_file, region_fasta, old_motif_format, nonsense, no_families, newest_only, top_set_only, calc_p, reverse_site_numbers, matched, degen, regions = args.dataset, args.feature_set, args.genome, args.families_file, args.fasta, args.hit_file_prefix, args.motifs_file, args.correspondances, args.alignments, args.suffix, args.trials, args.trial_file, args.old_trial_file, args.region_fasta, args.old_motif_format, args.nonsense, args.no_families, args.newest_only, args.top_set_only, args.calc_p, args.reverse_site_numbers, args.matched, args.degen, args.regions

    n_sim = 1000

    print(suffix)

    #set up feature set and families
    fs = Feature_Set(feature_set, genome)
    fs.set_dataset(dataset)
    if no_families:
        picked = fs.names
    else:
        families = rw.read_families(families_file)
        fs.add_families(families)
        picked = fs.pick_random_members()

    hit_phylip = "temp_data/temp_{0}.phy".format(random.random())
    control_phylip = "temp_data/temp_control_{0}.phy".format(random.random())

    if not nonsense:
        if old_motif_format:
            motifs = rw.read_names(motifs_file)[1:]
        else:
            motifs = rw.read_motifs(motifs_file)
            if top_set_only:
                summary_data = rw.read_many_fields(
                    "RBP/RBP_hg38_introncontaining_new.txt", "\t")
                summary_dict = list_to_dict(summary_data, 0, 4, floatify=True)
                motifs = {
                    RBP: motifs[RBP]
                    for RBP in motifs if (summary_dict[RBP] < 0.1)
                }
            motifs = list(set(flatten(motifs.values())))

    if reverse_site_numbers:
        site_number_suffix = "_reversed_site_numbers_"
    else:
        site_number_suffix = ""

    if matched:
        matched_suff = "_matched"
    else:
        matched_suff = ""

    if degen:
        degen_suff = "_degen.txt"
    else:
        degen_suff = ""

    with open(trial_file, "w") as trial_out:

        trial_out.write(
            "trial\tA\tT\tC\tG\told\told_no_hum_CG\tnew_no_human_CG\tnew_no_hum_no_anc_CG\tnew_w_CG\tnew_no_anc_CG\tnew_no_anc_CG_macaque\tnewer_no_human_CG\tnewer_no_hum_no_anc_CG\tnewer_w_CG\tnewer_no_anc_CG\n"
        )
        if old_trial_file != "None":
            old_trials = rw.read_many_fields(old_trial_file, "\t")
            old_trials = old_trials[1:]
            old_trials = [i[1:5] for i in old_trials]
            seed_kmers = 1
        else:
            seed_kmers = None

        #you can do this for loads of trials
        #useful as a negative control if you're generating a new set of nonsense motifs
        #each time
        for trial in range(trials):

            print(trial)

            trial_output = [trial]

            #if you're meant to generate a load of nonsense motifs rather than using real motifs
            if nonsense:
                if old_trial_file != "None":
                    #read in the intended nucleotide composition of the nonsense
                    #motifs from file
                    scaled_comp = [float(i) for i in old_trials[trial]]
                else:
                    #pick nonsense motifs nucleotide composition by chance
                    comp = [random.random() for i in range(4)]
                    scaled_comp = [i / np.sum(comp) for i in comp]
                comp_dict = {
                    i: scaled_comp[pos]
                    for pos, i in enumerate(nc._canon_bases_)
                }
                motifs, obtained_dict = nc.kmers_from_nc(6,
                                                         50,
                                                         comp_dict=comp_dict,
                                                         return_freqs=True,
                                                         seed=seed_kmers)
                motifs = ["motifs"] + motifs
                trial_output = trial_output + [
                    obtained_dict[i] for i in nc._canon_bases_
                ]
                temp_motifs_file = "temp_data/temp_motifs.txt"
                rw.write_names(motifs, temp_motifs_file)

            print(
                "===NEW METHOD WITH NO ANCESTRAL CpG (MACAQUE, BIG TREE, CONTEXT), REPLACEMENT CONTROL==="
            )
            hit_file = "{0}_hits_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            control_file = "{0}_controls_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            if nonsense:
                hit_file = "temp_data/temp_hits{0}.txt".format(random.random())
                control_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                error_file = "temp_data/temp_error{0}.txt".format(
                    random.random())
                get_control_sites(
                    fasta, genome, feature_set, families_file, dataset,
                    temp_motifs_file, hit_file, control_file, error_file,
                    "DFE/for_everybody/filtered_hg38_85_pc_multiexon_anc_CG_big_context_threshold05.txt",
                    [
                        "--leave_CG", "--context", "--remove_ancestral_CpG",
                        "--macaque_anc", "--big_tree", "--replacement_control"
                    ])
            get_density(fasta, motifs, fs)
            norm_ds = get_new_method_results(hit_file,
                                             control_file,
                                             hit_phylip,
                                             control_phylip,
                                             correspondances,
                                             alignments,
                                             fasta,
                                             regions=regions,
                                             global_fasta=region_fasta,
                                             fs=fs)
            trial_output.append(norm_ds)
            if calc_p:
                p, low_CI, high_CI, sd, Z = get_sim_p(
                    norm_ds,
                    hit_file,
                    control_file,
                    correspondances,
                    alignments,
                    fasta,
                    n_sim,
                    reverse_site_numbers=reverse_site_numbers,
                    sim_ds_file=
                    "{0}{1}_sim_norm_ds_no_anc_CG_only_macaque_big_context{2}_replace.txt{3}"
                    .format(hit_file_prefix, site_number_suffix, matched_suff,
                            degen_suff))

            trial_output = "\t".join([str(i) for i in trial_output])
            trial_out.write(trial_output)
            trial_out.write("\n")

            remove_file(hit_phylip)
Example #16
def get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs, macaque_anc = False, pseudoCG = False, comprehensive = False, subst_model = None, return_tuples = False, regions = False):
    '''
    Get two dictionaries, one that says for each transcript which positions are CpG/GpC in macaque
    and one which positions were likely CpG/GpC in the human-macaque ancestor.
    '''
    names, seqs = rw.read_fasta(fasta)
    #if you're gonna determine ancestral CpG positions from scratch rather than reading them in from an existing file
    #if you want to have the name of the file determined automatically
    if (not anc_CG_file_name) or (anc_CG_file_name == "None"):
        new_CG = True
        phy_file = "temp_data/temp_anc_CG{0}.txt".format(random.random())
    #if you want to give the file a name yourself
    elif not os.path.exists(anc_CG_file_name):
        new_CG = True
    else:
        new_CG = False

    if new_CG:
        print("Will get new CpG data...")
        if len(phylip_data) < 8 and comprehensive:
            print("Comprehensive CpG filtering only in big tree mode!")
            raise Exception
        #if you want to pretend that some other dinucleotides are CpG
        if pseudoCG:
            CG_kmers = ["C[\-]*T", "A[\-]*G"]
        #the hyphens are there in case the two nucleotides are separated by an indel
        else:
            CG_kmers = ["C[\-]*G", "G[\-]*C"]
        CG_kmers = [re.compile(i) for i in CG_kmers]
        macaque_CG_dict = {}

        anc_CG_concat_full = [[[""]], [[""]]]
        tuples_mapping_dict_full = {}

        for chrom in chroms:

            print(chrom)

            #only leave those CDSs that are on the current chromosome
            current_CDSs = {i: CDSs[i] for i in CDSs if CDSs[i][0][0][0] == chrom}
            coords_file = "temp_data/coords_file{0}.txt".format(random.random())

            #check if the MSA is already at the specified location, otherwise retrieve it
            MSA_file = "{0}_{1}.txt".format(MSA_file_name_prefix, chrom)
            if not os.path.isfile(MSA_file):
                print("Obtaining MSA...")
                eo.get_MSA_gene_list(current_CDSs, coords_file, "EPO", "primates", 85, "homo_sapiens", MSA_file)
                os.remove(coords_file)
                eo.flush_tables("localhost", "mysql", "fackel")
            MSA_raw = eo.parse_MSA_output(MSA_file)
            if high_CG_file_name != "None":
                high_CG = rw.read_many_fields(high_CG_file_name, "\t")
                high_CG = {i[0]: [int(j) for j in i[1:]] for i in high_CG}
            else:
                high_CG = None
            #get concatenated sequences (for determining ancestral CpG positions) and macaque CpG information for this chromosome
            anc_CG_concat, macaque_CG_dict, tuples_mapping_dict = get_CpG_dicts_core(MSA_raw, lengths, phylip_data, CG_kmers, macaque_anc, macaque_CG_dict, high_CG, comprehensive = comprehensive, subst_model = subst_model)
            remove_file(coords_file)
            #add that information to the global dictionaries
            anc_CG_concat_full, tuples_mapping_dict_full = update_anc_CG(anc_CG_concat_full, anc_CG_concat, tuples_mapping_dict_full, tuples_mapping_dict)
            
        phy_files = write_anc_CG(anc_CG_concat_full, anc_CG_file_name, clean_names, macaque_CG_dict)
        pp_file = anc_CG_file_name

    else:
        print("Will read in existing CpG data...")
        pp_file = None
        phy_files = "None"
        high_CG = None
        tuples_mapping_dict_full = None
        macaque_CG_file_name = "{0}_macaque.txt".format(anc_CG_file_name[:-4])
        macaque_CG_dict = rw.read_many_fields(macaque_CG_file_name, "\t")
        macaque_CG_dict = [i for i in macaque_CG_dict if len(i) == 2]
        macaque_CG_dict = list_to_dict(macaque_CG_dict, 0, 1)
        macaque_CG_dict = {i: [int(i) for i in macaque_CG_dict[i].split(",") if i != ""] for i in macaque_CG_dict}
    anc_CG_dict = get_ancestral_CG(pp_file, subst_model, phy_files, "DFE/UCSC_model.mod", tuples_mapping_dict_full, anc_CG_file_name, high_CG = high_CG, macaque = macaque_anc, comprehensive = comprehensive)
    [remove_file(i) for i in phy_files]
    #if you're looking at exon cores/flanks rather than full CDSs
    if regions:
        #you need to have matching bed/fasta files for this to work (with the records in the same order)
        bed = fasta.replace("fasta", "bed")
        transcripts = fs.get_transcripts()
        #for each flank/core, figure out what positions it covers in the full CDS
        mapping_dict = conservation.map_regions_to_CDS(fasta, bed, fs, transcripts, CDSs, trans_ids = True)
        anc_CG_dict = region_CpG(mapping_dict, anc_CG_dict)
    if return_tuples:
        return(anc_CG_dict, macaque_CG_dict, tuples_mapping_dict_full)
    else:
        return(anc_CG_dict, macaque_CG_dict)
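# Illustration only: the CG_kmers patterns above tolerate alignment gaps between the
# two nucleotides of a CpG/GpC dinucleotide.
import re

cpg = re.compile(r"C[\-]*G")
assert cpg.search("ttC--Gtt") is not None   # a CpG split by an indel still matches
assert cpg.search("ttCaGtt") is None        # an intervening base does not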
Example #17
def main():
    description = "Run mDFEest."
    args = parse_arguments(description, ["hit_file", "control_file", "SNP_file", "SNP_number", "input_file", "output_file", "seed", "fixed_model", "new_input", "shuffle", "fix_pop_change"], ints = [3], flags = [8, 9, 10])
    hit_file, control_file, SNP_file, SNP_number, input_file, output_file, seed, fixed_model, new_input, shuffle, fix_pop_change = args.hit_file, args.control_file, args.SNP_file, args.SNP_number, args.input_file, args.output_file, args.seed, args.fixed_model, args.new_input, args.shuffle, args.fix_pop_change

    #if you want to generate a new input file rather than reading in an existing one
    if new_input:
        remove_file("../multidfe/{0}".format(input_file.split("/")[-1]))
        arguments = ["python3", "mDFEest_input.py", hit_file, control_file, SNP_file, SNP_number, input_file]
        if shuffle:
            arguments.append("--shuffle")
        run_process(arguments)
    
    if seed == "None":
        seed = None
    else:
        seed = float(seed)

    #if you want to run it only with a population size change model,
    #rather than both a model assuming population size change and a fixed population
    #size model
    if fix_pop_change:
        pop_change = [True]
    else:
        pop_change = [False, True]

    if fixed_model == "None":
        #all possible models
        allowed = ["lognormal", "gamma", "beta", "spikes", "steps", "fixed six spikes"]
        spike_range = [2, 6]
    else:
        #only the specified model
        allowed = [fixed_model]
        #only two-spike models
        spike_range = [2, 3]

    with open(output_file, "w") as file:
        file.write("model\tpop_change\tAIC\tNes_0.0_0.1\tNes_0.1_1.0\tNes_1.0_10.0\tNes_10.0_100.0\traw\n")
        for change_mode in pop_change:
    
            print("\nPopulation expansion: {0}.".format(str(change_mode)))

            if "lognormal" in allowed:
                print("lognormal model:")
                output = mDFEest("lognormal", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            if "gamma" in allowed:
                print("gamma model:")
                output = mDFEest("gamma", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            if "beta" in allowed:
                print("beta model:")
                output = mDFEest("beta", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)

            for spike_number in range(spike_range[0], spike_range[1]):

                if "spikes" in allowed:
                    print("{0}-spikes model:".format(spike_number))
                    output = mDFEest("spikes", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode)
                    print(output)
                    write_mDFEest_output(output, file, change_mode)

                if "steps" in allowed:
                    print("{0}-steps model:".format(spike_number))
                    output = mDFEest("steps", input_file, n_spikes = spike_number, seed = seed, repetitions = 10, pop_change = change_mode)
                    print(output)
                    write_mDFEest_output(output, file, change_mode)

            if "fixed six spikes" in allowed:
                print("fixed six spikes model:")
                output = mDFEest("six_spikes", input_file, pop_change = change_mode, seed = seed)
                print(output)
                write_mDFEest_output(output, file, change_mode)
Example #18
def main():
    description = "Given a BED file of reads, filter out reads whose " \
                  "3' end maps to the last nucleotide of an intron or " \
                  "the last nucleotide of an exon."
    args = hk.parse_arguments(description, ["reads_file", "gtf", "outfile"])
    reads_file, gtf, outfile = args.reads_file, args.gtf, args.outfile

    print("Getting intron lariat positions...")

    # read in exon coordinates
    exons = rw.read_gtf(gtf, element="exon", gene=False)
    # make a BED file with the last positions of introns
    intron_lariat_bed = "{0}_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.write_intron_lariat_pos_from_exons(exons, intron_lariat_bed, add_chr = True)

    # intersect the reads with intron lariat positions
    intron_lariat_intersect_file_name = "{0}_intersect_with_intron_lariat_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, intron_lariat_bed, force_strand=True, write_both=True, no_dups=False, output_file=intron_lariat_intersect_file_name)
    hk.remove_file(intron_lariat_bed)
    intron_lariat_reads_file = "{0}_intron_lariat_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at intron lariat positions
    check_3prime_match(intron_lariat_intersect_file_name, intron_lariat_reads_file)
    hk.remove_file(intron_lariat_intersect_file_name)

    # write BED with the last positions of exons
    splice_intermediate_bed = "{0}_splice_intermediate_pos_all_exons.bed".format(reads_file[:-4])
    co.write_si_pos_from_exons(exons, splice_intermediate_bed, add_chr = True)

    print("Getting splice intermediate positions.")

    # intersect the reads with splice intermediate positions
    splice_intermediate_intersect_file_name = "{0}_intersect_with_SI_pos_all_exons.bed".format(reads_file[:-4])
    co.intersect_bed(reads_file, splice_intermediate_bed, force_strand=True, write_both=True, no_dups=False, output_file=splice_intermediate_intersect_file_name)
    hk.remove_file(splice_intermediate_bed)
    SI_reads_file = "{0}_SI_reads_all_exons.bed".format(reads_file[:-4])
    # check that the reads end exactly at the end of the exon
    check_3prime_match(splice_intermediate_intersect_file_name, SI_reads_file)
    hk.remove_file(splice_intermediate_intersect_file_name)

    print("Concatenating the two files.")

    # concatenate the IL and SI read files so you could exclude both in one go
    combined_file = "{0}_SI_and_IL_reads_all_exons.bed".format(reads_file[:-4])
    hk.run_process(["cat", SI_reads_file, intron_lariat_reads_file], file_for_output=combined_file)

    hk.remove_file(SI_reads_file)
    hk.remove_file(intron_lariat_reads_file)

    # do an exclusive intersect, requiring 1.0 overlap for both A and B, to remove the
    # putative intron lariat reads from the main reads file
    co.intersect_bed(reads_file, combined_file, overlap=1, overlap_rec=1, force_strand=True, no_dups=False, exclude=True, output_file=outfile)

    hk.remove_file(combined_file)
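
A hedged usage sketch for this script (the script and file names below are hypothetical), assuming hk.parse_arguments exposes the three arguments as positionals in the listed order:

# python filter_lariat_and_SI_reads.py reads.bed annotation.gtf filtered_reads.bed
# the output BED keeps only reads whose 3' ends coincide with neither the last position of an
# intron (putative lariat-derived reads) nor the last position of an exon (putative splicing intermediates)
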
Example #19
0
def mDFEest(model, input_file, n_spikes = None, repetitions = None, fold_SFS = True, pop_change = False, seed = None):
    '''
    Wraps call to multiDFEest.
    '''
    flags = []

    if fold_SFS:
        fold_SFS = 1
    else:
        fold_SFS = 0
    # the inversion is intentional: this value is passed as the -conpop flag to MultiDFE,
    # which expects 1 for a constant population size and 0 when a population size change is modelled
    if pop_change:
        pop_change = 0
    else:
        pop_change = 1
    # assumed default: run the optimisation once if no repetition count was given
    # (otherwise the -ranrep flag below would receive None)
    if repetitions is None:
        repetitions = 1

    #convert the English distribution names into multiDFEest model codes
    if model == "lognormal":
        model_code = 4
        #parameter number for calculating AIC
        par_number = 2
    elif model == "gamma":
        model_code = 2
        par_number = 2
    elif model == "beta":
        model_code = 3
        par_number = 2
    elif model == "spikes":
        model_code = 0
        if not n_spikes:
            print("To be able to use a spikes model, you need to specify the number of spikes.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "steps":
        model_code = 1
        if not n_spikes:
            print("To be able to use a steps model, you need to specify the number of steps.")
            raise Exception
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "six_spikes":
        model_code = 5
        par_number = 5
        flags = ["-ranrep", repetitions]
    else:
        raise Exception("{0} is not a valid model name!".format(model))

    input_file_short = os.path.basename(input_file)

    #do the analysis in the directory where multiDFEest is stored
    if not os.path.exists("../multidfe/{0}".format(input_file_short)):
        run_process(["cp", input_file, "../multidfe"])
    MDE_output = "{0}.MAXL.out".format(input_file_short)
    current_dir = os.getcwd()
    os.chdir("../multidfe")
    arguments = ["./MultiDFE", "-N1", 100, "-conpop", pop_change, "-sfsfold", fold_SFS, "-selmode", model_code, "-file", input_file_short]
    if seed:
        seed_string = "GSL_RNG_SEED={0}".format(seed)
        arguments = [seed_string] + arguments
    arguments.extend(flags)
    print(" ".join([str(i) for i in arguments]))
    #run multiDFEest
    run_process(arguments)
    #parse output
    output = rw.read_many_fields(MDE_output, "\t")[0]
    output = [i.split(":") for i in output if ":" in i]
    output = {i[0]: float(i[1]) for i in output}
    #get the log likelihood and calculate AIC
    ll = output["L"]
    print("\n")
    print(par_number)
    print(ll)
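    # AIC = 2k - 2*ln(L), where k is the number of free parameters and ln(L) the maximised log likelihood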
    AIC = (2 * par_number) - (2 * ll)
    output["AIC"] = AIC
    if n_spikes:
        output["model"] = "{0}_{1}".format(model, n_spikes)
    else:
        output["model"] = model
    remove_file(MDE_output)
    os.chdir(current_dir)
    return output
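
A minimal usage sketch for the wrapper above; the input file name is hypothetical and is assumed to be a site frequency spectrum in the format MultiDFE reads:

# hedged usage sketch; the SFS file name is hypothetical
fit = mDFEest("gamma", "parsed_sfs.txt", pop_change=True, seed=42)
print(fit["model"], fit["AIC"])
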
Example #20
0
def get_ss_strength(exons,
                    genome_file,
                    upstream=True,
                    five=True,
                    exonic=3,
                    intronic=6):
    """
    Given a set of exons, get an estimate of splice site strength.
    :param exons: Dictionary of CDS lines.
    :param genome_file: File with genome sequence.
    :param upstream: evaluate the (5' or 3') splice site of the upstream intron (rather than downstream)
    :param five: evaluate the 5' splice site (rather than 3')
    :param exonic: how many nucleotides to include from the exon
    :param intronic: how many nucleotides to include from the intron
    :return: a dictionary with the splice site strength for each exon
    """
    # will contain the splice site strengths
    out_dict = {}
    # will contain the names of the exons so that later on, we'd know which
    # splice site strength value goes with which exon
    names = []

    # write splice site coordinates to GTF
    hk.make_dir("temp_data")
    temp_file_name = "temp_data/ss_sequences.gtf"
    with open(temp_file_name, "w") as temp_file:
        writer = csv.writer(temp_file, delimiter="\t")
        for transcript in exons:
            curr_exons = exons[transcript]
            for pos, exon in enumerate(curr_exons):
                # don't analyze first exons
                if (pos != 0):
                    # skip the last exon when looking downstream, because it has no downstream intron
                    if (upstream or (pos != len(curr_exons) - 1)):
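                        # the 5' splice site window spans the last `exonic` bases of an exon plus the first
                        # `intronic` bases of the downstream intron; the 3' window spans the last `intronic`
                        # bases of the upstream intron plus the first `exonic` bases of the exon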
                        if five:
                            if upstream:
                                template = curr_exons[pos - 1].copy()
                            else:
                                template = exon.copy()
                            if template[6] == "+":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                            elif template[6] == "-":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                        else:
                            if upstream:
                                template = exon.copy()
                            else:
                                template = curr_exons[pos + 1].copy()
                            if template[6] == "+":
                                template[4] = template[3] + exonic - 1
                                template[3] = template[3] - intronic
                            elif template[6] == "-":
                                template[3] = template[4] - exonic + 1
                                template[4] = template[4] + intronic
                        # skip windows that would run off the start of the sequence (can happen on short scaffolds etc.)
                        if template[3] >= 0:
                            # so you'd know the order of the values in the MaxEntScan output
                            names.append("{0}.{1}".format(transcript, pos - 1))
                            writer.writerow(template)

    # make a FASTA with splice site sequences
    temp_fasta_file_name = "{0}.fasta".format(temp_file_name[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", genome_file, "-bed", temp_file_name,
        "-fo", temp_fasta_file_name, "-s"
    ])
    # filter FASTA for Ns
    fasta_lines = []
    with open(temp_fasta_file_name) as fasta:
        for line in fasta:
            if line[0] == ">":
                curr_name = line
            else:
                if "N" not in line:
                    fasta_lines.append(curr_name)
                    fasta_lines.append(line)
    with open(temp_fasta_file_name, "w") as fasta:
        for line in fasta_lines:
            fasta.write(line)

    # run MaxEntScan on the FASTA
    # lazy hardcoded path, replace as appropriate...
    mes_direct = "/Users/rsavisaar/Software/MaxEntScan/fordownload"
    if five:
        cmd = "/Users/rsavisaar/Software/MaxEntScan/fordownload/score5.pl"
    else:
        cmd = "/Users/rsavisaar/Software/MaxEntScan/fordownload/score3.pl"
    temp_mes_file_name = "{0}_mes.txt".format(temp_file_name[:-4])
    hk.run_process(["perl", cmd, temp_fasta_file_name],
                   file_for_output=temp_mes_file_name,
                   verbose=True)
    hk.remove_file(temp_fasta_file_name)
    hk.remove_file(temp_file_name)

    # read in splice site scores and store them in the output dictionary
    with open(temp_mes_file_name, newline="") as mes_file:
        reader = csv.reader(mes_file, delimiter="\t")
        for pos, line in enumerate(reader):
            out_dict[names[pos]] = float(line[1])
    hk.remove_file(temp_mes_file_name)
    return out_dict
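
A minimal usage sketch (file names are hypothetical), assuming the exons dictionary is keyed by transcript and holds GTF-style exon lines, e.g. as read with rw.read_gtf(gtf, element="exon", gene=False) in the main() above. MaxEntScan's score5 scores 3 exonic + 6 intronic bases and score3 scores 3 exonic + 20 intronic bases, which is what the exonic/intronic arguments should match:

# hedged usage sketch; file names are hypothetical
exons = rw.read_gtf("annotation.gtf", element="exon", gene=False)
# 5' splice site strength of the intron upstream of each internal exon (3 exonic + 6 intronic bases)
five_prime_scores = get_ss_strength(exons, "genome.fa", upstream=True, five=True, exonic=3, intronic=6)
# 3' splice site strength of the same introns (3 exonic + 20 intronic bases)
three_prime_scores = get_ss_strength(exons, "genome.fa", upstream=True, five=False, exonic=3, intronic=20)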