Exemplo n.º 1
0
    def get_cluster_elements(self):
        '''
        Return the fasta records belonging to the elements of this cluster.

        The fasta file is looked up next to the clstr file: cd-hit writes a
        duplicate fasta with the same file name as the clstr file but without
        the extension, so the path is self.parent_file up to the first '.'.
        Records are matched on the header text up to the first space, which is
        compared with the name attribute of each item in self.elements.

        Returns a list of the tuples produced by read_as_tuples, in the order
        of self.elements; elements without a matching header are skipped.
        If the fasta file does not exist the FileNotFoundError instance is
        returned (not raised), so callers must check the type of the result.
        '''
        fasta = self.parent_file.split('.')[0]
        try:
            # read_as_tuples is given the path (as at the other call sites);
            # the lookup is built inside the try in case the reader opens the
            # file lazily while being iterated.
            search_dict = {
                record[0].split(' ')[0]: record  # key: header up to 1st space
                for record in read_as_tuples(fasta)
            }
        except FileNotFoundError as e:
            return e

        # keep only the cluster elements that are present in the fasta file
        return [
            search_dict[clstr_element.name]
            for clstr_element in self.elements
            if clstr_element.name in search_dict
        ]  # list of tuples
Exemplo n.º 2
0
    def write_cluster_fastas(self, original_fasta_path, path):
        '''
        Iterates through all clusters in the cluster set and writes one fasta
        file per cluster containing the elements of that cluster. The
        intention is that consensus_tools can then be used to create
        consensus seqs from each of the cluster files.

        original_fasta_path: path of the fasta file all clusters originate
            from (assumes full path); its records are matched to cluster
            elements on the header text up to the first space.
        path: directory the cluster files are written into. Each file is
            named '<parent>_cluster_<num>' where <parent> is the base name of
            the cluster's parent_file up to its first '.'.

        Side effect: the fasta attribute of every cluster is set to the path
        of the file written for it.

        Returns the list of paths of the written files, one per cluster.
        '''
        # all clusters of a file come from the same original fasta, so the
        # header -> record lookup is built once
        search_dict = {
            record[0].split(' ')[0]: record
            for record in read_as_tuples(original_fasta_path)
        }

        fasta_paths = []  # store the paths of written files
        for cluster in self.clusters_set:
            # Take the base name BEFORE cutting at the first '.', otherwise a
            # dot in the directory part (e.g. './out/x.clstr') truncates the
            # path and yields an empty or wrong file stem.
            parent_name = os.path.basename(cluster.parent_file).split('.')[0]
            file_name = os.path.join(
                path,
                '{}_cluster_{}'.format(parent_name, cluster.num.strip()))
            fasta_paths.append(file_name)

            # elements of the cluster that are found in the original fasta
            write_list = [
                search_dict[clstr_element.name]
                for clstr_element in cluster.elements
                if clstr_element.name in search_dict
            ]
            write_from_tuple_list(write_list, file_name)

            cluster.fasta = file_name  # change fasta variable of the cluster

        return fasta_paths
Exemplo n.º 3
0
def convert_rep_seqs_to_longest_orf(rep_seq_fasta):
    '''
    Given a fasta file of sequences (should be the rep sequences), pairs each
    original header with the result of find_longest_orfs applied to the
    six_frames output of that header's sequence.

    The resulting list is written back to rep_seq_fasta with
    write_from_tuple_list, i.e. the input file itself is the write target,
    and is also returned so it can be passed off to the tblastn search
    method.
    '''
    # every record is read and converted before the file is written again
    rep_proteins = [
        (header, find_longest_orfs(six_frames(sequence)))
        for header, sequence in read_as_tuples(rep_seq_fasta)
    ]

    write_from_tuple_list(rep_proteins, rep_seq_fasta)
    return rep_proteins  # list of (header, longest ORF result) tuples
Exemplo n.º 4
0
def get_rep_sequence(fasta_file_path, clstr_file_path):
    '''
    Given a path to a fasta file and the clstr file produced by cd hit for
    that fasta file, returns the (header, sequence) tuple of the record whose
    header, up to the first space, equals the formatted header of the
    representative of the largest cluster. Returns False when no record
    matches.
    '''
    records = read_as_tuples(fasta_file_path)
    # looks like >name=RLC_Gmr2_Gm1-29, which is the first thing in the header
    # of the wanted record (assuming standard soybase fasta header format)
    wanted = format_rep_header(get_largest_cluster_rep(clstr_file_path))

    matches = (
        (header, seq)
        for header, seq in records
        if header.split(' ')[0] == wanted
    )
    # first matching record, or False when nothing matched
    return next(matches, False)
Exemplo n.º 5
0
    def get_cluster_elements(self):
        '''
        Reads the fasta file that belongs to this cluster's clstr file and
        indexes its records by header text up to the first space.

        The fasta duplicate made by cd hit has the same file name as the
        clstr file but no extension, so the path is self.parent_file up to
        the first '.'.

        Returns the FileNotFoundError instance (not raised) when the fasta
        file is missing; otherwise falls off the end and returns None.
        '''
        # NOTE(review): the lookup built below is never used or returned in
        # this version of the method - confirm whether the matching against
        # self.elements is meant to follow here.
        fasta_path = self.parent_file.split('.')[0]
        try:
            # read_as_tuples opens the file itself, so it is given the path
            search_dict = {
                record[0].split(' ')[0]: record
                for record in read_as_tuples(fasta_path)
            }
        except FileNotFoundError as missing:
            return missing