def families(params='published params.csv', sanity_file=None):
    '''
    Build a Family for every protein that has a multiple sequence alignment.

    Structure files are looked up as 'structures/aligned_XXXX.pdb' and
    alignments as 'gonnet aligned/XXXX with *.clu', both relative to the
    current working directory, where XXXX is the four-character id
    captured from the file name.

    params is handed unchanged to every Family as its last argument.
    sanity_file, when not None, is a path that is overwritten with the
    repr of the three lookup tables (structure paths, alignment paths,
    template sequence names), one per line, for manual inspection.

    Returns a CIDict mapping each id found among the alignments to its
    Family.

    A globbed path that does not fit the expected pattern makes re.match
    return None, so the .group() call raises AttributeError.
    '''
    def _paths_by_pdbid(glob_pattern, id_regex):
        # Index every globbed path by the id captured from its name
        found = glob.glob(glob_pattern)
        ids = [re.match(id_regex, each).group(1) for each in found]
        return CIDict(zip(ids, found))

    # PDBID -> path of the aligned structure
    stru_path = _paths_by_pdbid('structures/aligned_*.pdb',
                                r'structures[/\\]aligned_(....)\.pdb')

    # PDBID -> path of the multiple sequence alignment
    msa_path = _paths_by_pdbid('gonnet aligned/* with *.clu',
                               r'gonnet aligned[/\\](....) with .*\.clu')

    # PDBID -> name of the structure's own sequence inside the alignment
    template_id = CIDict((pdbid, 'template_' + pdbid.upper())
                         for pdbid in msa_path.keys())

    if sanity_file is not None:
        with open(sanity_file, 'w') as dump:
            tables = (stru_path, msa_path, template_id)
            for position, table in enumerate(tables):
                # Newline between tables, none after the last one
                if position:
                    dump.write('\n')
                dump.write(repr(table))

    def _family_for(pdbid):
        return Family(pdbid, stru_path[pdbid], msa_path[pdbid],
                      template_id[pdbid], params)

    # Only proteins that have an alignment get a Family
    return CIDict((pdbid, _family_for(pdbid)) for pdbid in msa_path.keys())
def create_session(workingdir, load = True):
    '''
    Load the aligned structures and prepare the PyMOL session for them.

    workingdir: folder containing a 'structures' subfolder; it is passed
        to groups_from_folder together with the 'aligned_(.*).pdb'
        pattern.
    load: forwarded unchanged to groups_from_folder.

    Structures are kept only if a matching '<id>.csv' file exists in the
    'non ppi residues' folder (looked up relative to the current working
    directory, not workingdir). 1E54 is always removed. Everything that
    is left is handed to cs_make_selections and shown as cartoon.

    Returns the CIDict of remaining groups. The same object is also
    stored as stored.groupdict before 1E54 is removed from it.

    Raises KeyError from the final deletion if '1E54' is no longer in
    groupdict at that point (assuming CIDict behaves like a dict there).
    '''
    # Load structures:
    groupdict = CIDict(groups_from_folder(workingdir + '/structures',
                                   ['aligned_(.*).pdb'], load = load))

    # Remove structures not in the dataset
    included_proteins = set()
    for filename in os.listdir('non ppi residues'):
        # Raw string: '\d' and '\.' are regex escapes, not string escapes
        match = re.match(r'(\d...)\.csv', filename)
        if match is not None:
            included_proteins.add(match.group(1))

    # Iterate over a snapshot of the keys: entries are deleted inside the
    # loop, which is an error when keys() is a live view of the mapping.
    for pdbid in list(groupdict.keys()):
        if pdbid.upper() not in included_proteins:
            del groupdict[pdbid]
            cmd.delete(pdbid)

    # In case something goes wrong, so you can look at the work in progress:
    stored.groupdict = groupdict

    # Delete 1E54, since its interface is included in the non_ppi dataset
    cmd.delete('1E54')
    del groupdict['1E54']

    cs_make_selections(groupdict)

    # Change from line to cartoon representation
    cmd.hide('lines','*')
    cmd.show('cartoon','*')

    return groupdict
# Progress report; oracles is defined outside this excerpt.
print('Alignments loaded... ' + repr(oracles))

# Calculate the moments for the pdb sequences.
# pdb_moments maps each structure's id to the result of moment() called
# with that structure, its residue list, its center, and the pdb sequence
# supplied by the structure's oracle.
# partial(calculator_adapter, calc) pre-binds calc as the first argument
# of calculator_adapter.
pdb_moments = CIDict([(structure.get_id(),
                   moment(structure, resi_lists[structure.get_id()],
                          centers[structure.get_id()],
                          partial(calculator_adapter, calc),
                          oracles[structure.get_id()].pdb_sequence()))
           for structure in structures])
print('pdb moments calculated! ' + repr(pdb_moments))

# Calculate the family moments, that is, the moments for all
# sequences in the alignments.
# Every pdbid that has an alignment starts with an empty list.
family_moments = CIDict((pdbid, list()) for pdbid in alignments.keys())

for pdbid in family_moments.keys():
    # One pass per sequence (by index) in this protein's alignment
    for seq_index in range(len(oracles[pdbid].get_alignment())):
        # Calculate the moment
        family_moment = moment(structure_dict[pdbid], resi_lists[pdbid],
                               centers[pdbid],
                               partial(calculator_adapter, calc),
                               oracles[pdbid].sequence(seq_index))

        # Calculate the %identity with the pdb sequence
        # NOTE(review): the result is named normalized_distance while the
        # comment above says %identity -- confirm what matrices.compare
        # returns when given `identity`.
        pdb_sequence = oracles[pdbid].get_pdb_seq_record().seq
        sequence = oracles[pdbid].get_alignment()[seq_index].seq
        normalized_distance = matrices.compare(pdb_sequence, sequence,
                                               identity)

        # Id of the aligned sequence. The code that consumes family_moment,
        # normalized_distance and seq_id is not visible in this excerpt.
        seq_id = oracles[pdbid].get_alignment()[seq_index].id
# Retrieve Daniel's aligned structures
# structures maps each pdbid to the structure parsed from
# 'aligned_<pdbid>.pdb' in the current working directory.
structures = CIDict()
parser = PDBParser()
with warnings.catch_warnings():
    # When importing Daniel's aligned structures, the PDBParser gives
    # warnings about "invalid or missing" b factors and occupancies
    # There are so many that if you let it display warnings it'll never
    # finish parsing
    # simplefilter('ignore') silences every warning raised inside this
    # with-block; the previous filters are restored on exit.
    warnings.simplefilter('ignore')
    for pdbid in ('1A0S', '1AF6'):
        structures.update({pdbid: \
                           parser.get_structure(pdbid,
                           'aligned_{}.pdb'.format(pdbid))})
# z_coords maps each pdbid to a list that starts out empty; the code that
# fills it is not visible in this excerpt.
z_coords = CIDict()
for pdbid in structures.keys():
    z_coords.update({pdbid: list()})
    # Iterator over this protein's sequence (sequences is defined outside
    # this excerpt) -- presumably advanced in step with the residues below;
    # TODO confirm against the rest of the loop body.
    iter_sequence = iter(sequences[pdbid])
    for residue in structures[pdbid].get_residues():
        try:
            calpha = residue.child_dict['CA']
        except KeyError:
            # HOH and other heteroatoms will not have a C-alpha
            # They will also not have a corresponding letter in the sequence
            # So, skip them
            continue

        # Second element of the residue id and third coordinate of the
        # C-alpha atom
        resi = residue.get_id()[1]
        z = calpha.get_coord()[2]

        # This structure should have the same sequence as the one
class Calculator(object):
    '''
    Carries out ez-beta calculations using a set of parameters given to
    it at initialization.

    The set of parameters must be a spreadsheet represented as
    a list of lists, with the inner lists representing rows (any iterable
    or iterator of rows, such as a csv reader, works as well). The first
    row must contain the one-letter codes of each amino acid for which
    parameters are to be given; blank cells in that row are ignored.
    Underneath each letter is a column containing its parameters in this
    order:
    Curve type ('gaussian' or 'sigmoidal')
    E0/Emin
    Zmid/Zmin
    n/sigma

    Calculating pseudo-energies:
    calculate(self, resn, z): gives pseudoenergy given a one-letter or
    three-letter code for an amino acid, and a z coordinate

    The "normalize" option is vestigial - I used to calculate what fraction
    an energy is of the maximum possible energy that that kind of residue
    can have. But, that doesn't really make much sense. It's still here
    so that I can rerun my old scripts if I need to.
    '''

    def __init__(self, iterable, normalize = False):
        '''
        Read the parameter spreadsheet into self.ref.

        iterable: the rows of the spreadsheet, in the layout described in
            the class docstring. Exactly five rows are consumed.
        normalize: stored as self.normalize; see calculate().

        Raises StopIteration if fewer than five rows are available,
        KeyError if a curve type is neither 'sigmoidal' nor 'gaussian',
        and ValueError if a parameter cell cannot be converted to float.
        '''

        self.normalize = normalize

        # iter() accepts both a plain list of rows and an existing
        # iterator, and the next() builtin works the same way on either,
        # unlike the .next() method, which lists do not have.
        rows = iter(iterable)

        # self.ref: letter -> dict of that residue's curve type and
        # parameters. colmap: letter -> column index in the spreadsheet.
        self.ref = CIDict()
        colmap = CIDict()
        for column, letter in enumerate(next(rows)):
            if letter != '':
                self.ref.update({letter: dict()})
                colmap.update({letter: column})
        curvetypes = next(rows)
        for letter, column in colmap.items():
            self.ref[letter].update({'curve': curvetypes[column]})
        # Each of the three remaining rows holds one parameter, whose name
        # depends on the curve type of the column it sits in.
        for parameter in [{'sigmoidal': 'e0', 'gaussian': 'emin'},
                          {'sigmoidal': 'zmid', 'gaussian': 'zmin'},
                          {'sigmoidal': 'n', 'gaussian': 'sigma'}]:
            paramrow = next(rows)
            for letter in self.ref.keys():
                curvetype = self.ref[letter]['curve']
                self.ref[letter].update({parameter[curvetype]: \
                                         float(paramrow[colmap[letter]])})

    def calculate(self, resn, z):
        '''
        gives pseudoenergy given a one-letter or
        three-letter code for an amino acid, and a z coordinate

        Only the absolute value of z is used.

        Raises a NoParameters exception when you use an amino acid that
        it doesn't have parameters for; always have some way of handling
        this when you call this method!
        Raises ValueError if the stored curve type is neither 'gaussian'
        nor 'sigmoidal'.
        '''

        # Three-letter codes are translated through the one_letter table.
        # A code missing from that table raises whatever one_letter raises
        # on a failed lookup, not NoParameters.
        if len(resn) == 3:
            resn = one_letter[resn]

        try:
            params = self.ref[resn]
        except KeyError:
            raise NoParameters('No parameters for resn ' + str(resn))

        curve = params['curve']
        if curve == 'gaussian':
            # emin * exp(-(|z| - zmin)^2 / (2 * sigma^2))
            output = params['emin'] * \
                     math.exp(-1*(abs(z)-params['zmin'])**2 \
                                 /(2*params['sigma']**2))
        elif curve == 'sigmoidal':
            # e0 / (1 + (|z| / zmid)^n)
            output = params['e0']/(1+(abs(z)/params['zmid'])**params['n'])
        else:
            # Without this branch, output would be unbound below
            raise ValueError('Unknown curve type ' + repr(curve))

        if self.normalize:
            if curve == 'gaussian':
                output /= params['emin']
                # Normalized trends are high energy in middle of the membrane
                # for sigmoidal, high energy in the head-group region for
                # gaussian. For aromatics and small hydrophobics (anything with
                # negative E0 or Emin) these trends should be reversed
                if params['emin'] < 0:
                    output = 1 - output

            if curve == 'sigmoidal':
                output /= params['e0']
                if params['e0'] < 0:
                    output = 1 - output

        return output
        # I expect there to only be one pdbid
        # (pdbid_set and spreadsheet are built earlier in this loop body,
        # outside this excerpt.)
        assert len(pdbid_set) == 1, 'more than 1 pdbid in one spreadsheet'
        pdbid = list(pdbid_set)[0]
        # weights maps each pdbid to its spreadsheet
        weights.update({pdbid: spreadsheet})

    # selections maps pdbids to sets of resis
    selections = CIDict()
    for pdbid, spreadsheet in weights.items():
        # Predicate used to drop empty cells from the 'resi' column
        def not_blank(string):
            return string != ''
        resis = filter(not_blank, spreadsheet.get_column('resi'))
        selections.update({pdbid: set(resis)})

    # A new global variable for looping over these proteins
    # Holds only the groups from groupdict whose pdbid has a spreadsheet
    # in weights.
    asymmetric_dataset = CIDict([(pdbid, groupdict[pdbid]) \
                                 for pdbid in weights.keys()])

    # Make the spreadsheets available through groupdict
    for pdbid, group in asymmetric_dataset.items():
        group.non_ppi = weights[pdbid]


    # Make selection
    # '<PDBID>.non_ppi' starts out as the empty selection and is then
    # redefined once per resi as (that resi within '<PDBID>.molecule')
    # OR (the selection so far), so it accumulates every listed residue.
    for pdbid, selection in selections.items():
        cmd.select(pdbid.upper() + '.non_ppi', 'none')
        for resi in selection:
            cmd.select(pdbid.upper() + '.non_ppi',
                       '{0}.molecule & i. {1} | {0}.non_ppi' \
                       .format(pdbid.upper(),resi))

finally: