class PDB_Parser:

    # Class-level parser object for reading in structures in CIF format;
    # it is created once, when the class body is executed.
    CIF_PARSER = MMCIFParser()

    def __init__(self, path):
        '''
            Initialize every PDB_Parser with a path to a structure-file in CIF format.
            An example file is included in the repository (7ahl.cif).
            The file is parsed exactly once; the result is stored on the instance
            and re-used by the other methods.

            Input:
                path : Path to the structure file in CIF format.

            Attributes set:
                self.structure : the parsed Biopython.PDB structure
                self.chains    : list of the chains of the first model (empty
                                 list if the structure contains no model), so
                                 that any method reading it works regardless
                                 of the order in which methods are called
        '''

        # Use the class-level parser instead of constructing a second one.
        self.structure = self.CIF_PARSER.get_structure('my protein', path)

        # Fill self.chains right away rather than relying on
        # get_number_of_chains() being called before anything else.
        first_model = next(iter(self.structure.get_models()), None)
        if first_model is None:
            self.chains = []
        else:
            self.chains = list(first_model.get_chains())

    # 3.8 Chains
    def get_number_of_chains(self):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
            Return:
                Number of chains in this structure as integer.
                Only the chains of the first model are counted; 0 is returned
                if the structure contains no model.

            Side effects:
                self.model  : list of all models of the structure
                self.chains : list of the chains of the first model
        '''
        # https://www.tutorialspoint.com/biopython/biopython_pdb_module.htm

        # Materialise the models before storing them: keeping the object
        # returned by get_models() and then iterating over it would leave an
        # exhausted iterator in self.model.
        models = list(self.structure.get_models())
        self.model = models

        # Guard against a structure without any model instead of failing
        # with an IndexError on models[0].
        self.chains = list(models[0].get_chains()) if models else []
        return len(self.chains)

    # 3.9 Sequence
    def get_sequence(self, chainId):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chainId  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Return the amino acid sequence (single-letter alphabet!) of a given chain (chainId)
                in a Biopython.PDB structure as a string.
                The chain is taken from the first model (model 0). Residues whose
                name is not listed in three_letters_code (e.g. water, 'HOH') are
                skipped.
            Raises:
                KeyError if model 0 has no chain with the identifier chainId.
        '''

        # Look the chain up by its identifier directly. This does not depend on
        # self.chains having been filled by another method, and it does not
        # recover identifiers by slicing the chain's string representation
        # (which only worked for single-character identifiers).
        chain = self.structure[0][chainId]

        # Ask each residue for its name instead of slicing str(residue):
        # the slice truncates names longer than three characters and returns
        # unrelated text for residues with a different string representation.
        residue_names = (residue.get_resname()
                         for residue in chain.get_residues())

        return ''.join(one_letter[name] for name in residue_names
                       if name in three_letters_code)

    # 3.10 Water molecules
    def get_number_of_water_molecules(self, chain_id):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Return the number of water molecules of a given chain (chain_id)
                in a Biopython.PDB structure as an integer.
                The chain is taken from the first model (model 0); a residue
                counts as water when its residue name is 'HOH'.
            Raises:
                KeyError if model 0 has no chain with the identifier chain_id.
        '''

        # Direct lookup: no dependency on self.chains being set elsewhere and
        # no parsing of the chain's string representation, so identifiers with
        # more than one character work as well.
        chain = self.structure[0][chain_id]

        # Compare the residue name itself rather than a slice of str(residue).
        return sum(1 for residue in chain.get_residues()
                   if residue.get_resname() == 'HOH')

    # 3.11 C-Alpha distance
    def get_ca_distance(self, chain_id_1, index_1, chain_id_2, index_2):
        '''
            Compute the C-alpha distance between two residues of the first model.

            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id_1 : String, identifier of the chain holding the first residue
                index_1    : position of the first residue within the list of
                             standard amino-acid residues of chain_id_1
                chain_id_2 : String, identifier of the chain holding the second residue
                index_2    : position of the second residue within the list of
                             standard amino-acid residues of chain_id_2

                The two chains may be identical or different, so distances
                within one chain as well as between chains can be queried.

            Return:
                The distance between the 'CA' atoms of the two residues,
                converted to an integer via int().
        '''
        from Bio.PDB.Polypeptide import standard_aa_names

        first_model = self.structure[0]  # not the same object as self.model
        selected_chains = (first_model[chain_id_1], first_model[chain_id_2])

        # Restrict both chains to standard amino acids so that the later
        # 'CA' lookup is not attempted on other residue types.
        amino_acids_1, amino_acids_2 = (
            [res for res in chain if res.get_resname() in standard_aa_names]
            for chain in selected_chains
        )

        first_residue = amino_acids_1[index_1]
        second_residue = amino_acids_2[index_2]

        # Euclidean norm of the difference of the two C-alpha coordinates.
        delta = first_residue['CA'].coord - second_residue['CA'].coord
        distance = np.sqrt(np.sum(np.square(delta)))

        return int(distance)

    # 3.12 Contact Map
    def old_get_contact_map(self, chain_id):
        '''
            Legacy entry point for the contact map; delegates to get_contact_map().

            The previous body of this method passed Residue objects to
            get_ca_distance(), which uses them as list indices (TypeError), and
            wrote into a matrix of fixed size 10 x 10 regardless of the chain
            length. It is kept only so that existing callers of this name
            continue to work.

            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Whatever get_contact_map(chain_id) returns: the contact map of
                the given chain as a numpy array of integer C-alpha distances.
        '''

        return self.get_contact_map(chain_id)

    # 3.12 Contact Map
    def get_contact_map(self, chain_id):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Return a complete contact map (see description in exercise sheet)
                for a given chain in a Biopython.PDB structure as numpy array.
                The values in the matrix describe the c-alpha distance between all
                standard amino-acid residues in the chain (taken from model 0);
                row/column i refers to the i-th such residue.
                Only integer values of the distance are given: each distance is
                truncated to an integer. A chain without standard amino-acid
                residues yields an array of shape (0, 0).
            Raises:
                KeyError if model 0 has no chain chain_id, or if a standard
                amino-acid residue has no 'CA' atom.
        '''

        from Bio.PDB.Polypeptide import standard_aa_names

        chain = self.structure[0][chain_id]

        # Only standard amino acids are considered, which avoids the
        # KeyError 'CA' on other residue types (e.g. water).
        # reshape(-1, 3) keeps the array two-dimensional even when the list
        # is empty.
        ca_coords = np.array(
            [residue['CA'].coord for residue in chain
             if residue.get_resname() in standard_aa_names],
            dtype=np.float32,
        ).reshape(-1, 3)

        # Pairwise coordinate differences via broadcasting: shape (n, n, 3).
        deltas = ca_coords[:, np.newaxis, :] - ca_coords[np.newaxis, :, :]
        distances = np.sqrt(np.sum(deltas * deltas, axis=-1))

        # The builtin int replaces the np.int / np.float aliases, which no
        # longer exist in current NumPy releases.
        return distances.astype(int)

    # 3.13 B-Factors
    def get_bfactors(self, chain_id):
        '''
            Input:
                self: Use Biopython.PDB structure which has been stored in an object variable
                chain_id  : String (usually in ['A','B', 'C' ...]. The number of chains
                        depends on the specific protein and the resulting structure)
            Return:
                Return the B-Factors for all standard amino-acid residues in a
                chain (taken from model 0) of a Biopython.PDB structure, one
                entry per residue in chain order.
                In a Biopython.PDB structure B-Factors are given for each atom in a residue.
                The B-Factor of a residue is the mean over the B-Factors of all
                its atoms; np.nan is inserted when a residue provides no
                B-Factor at all.

                The values are normalized to Standard scores (zero mean, unit
                standard deviation), ignoring nan entries, and returned as a
                numpy array converted with astype(int), i.e. truncated toward zero.
                NOTE: nan entries have no meaningful integer representation;
                their value after the integer conversion is not defined.
            Raises:
                KeyError if model 0 has no chain with the identifier chain_id.
        '''

        from Bio.PDB.Polypeptide import standard_aa_names

        chain = self.structure[0][chain_id]

        # One mean B-Factor per standard amino-acid residue. Collecting into a
        # list keeps the positions aligned with the amino acids only; other
        # residues (e.g. water) do not advance the position.
        residue_means = []
        for residue in chain:
            if residue.get_resname() not in standard_aa_names:
                continue
            atom_values = [atom.get_bfactor() for atom in residue]
            atom_values = [value for value in atom_values if value is not None]
            residue_means.append(np.mean(atom_values) if atom_values else np.nan)

        b_factors = np.array(residue_means, dtype=np.float32)

        # Statistics are taken over the available (non-nan) values only,
        # which is what np.nanmean / np.nanstd compute.
        available = b_factors[~np.isnan(b_factors)]
        if available.size == 0:
            # Nothing to normalize (empty chain or no B-Factor at all).
            standardized = b_factors
        else:
            centred = b_factors - available.mean()
            spread = available.std()
            # Identical B-Factors give a spread of 0; skip the division then
            # so the result is 0 instead of nan/inf.
            standardized = centred / spread if spread > 0 else centred

        return standardized.astype(int)  # integer values, truncated toward zero