def gather_data():
    """Read mmCIF files and write down a few numbers (one file -> one line).

    Writes tab-separated rows to stdout: first a header row, then one row
    for every path yielded by ``util.get_file_paths_from_args()``.

    Columns:
        code      -- ``_entry.id`` converted with ``cif.as_string``
        na_chains -- number of ``_entity_poly.type`` values that contain
                     the substring 'nucleotide'
        vs        -- ``_exptl_crystal.density_percent_sol`` (as returned
                     by ``find_value``)
        vm        -- ``_exptl_crystal.density_Matthews`` (ditto)
        d_min     -- ``_refine.ls_d_res_high`` (ditto)
        date      -- ``parse_date`` applied to the first value of
                     ``_pdbx_database_status.recvd_initial_deposition_date``
        group     -- ``_pdbx_deposit_group.group_id`` (as returned by
                     ``find_value``)
    """
    deposition_tag = '_pdbx_database_status.recvd_initial_deposition_date'
    tsv = csv.writer(sys.stdout, dialect='excel-tab')
    header = ['code', 'na_chains', 'vs', 'vm', 'd_min', 'date', 'group']
    tsv.writerow(header)
    for cif_path in util.get_file_paths_from_args():
        # Each input file is expected to hold exactly one data block.
        blk = cif.read(cif_path).sole_block()
        entry_code = cif.as_string(blk.find_value('_entry.id'))
        # Count the polymer entities whose type mentions 'nucleotide'.
        nucleic_count = 0
        for row in blk.find('_entity_poly.type'):
            if 'nucleotide' in row[0]:
                nucleic_count += 1
        solvent = blk.find_value('_exptl_crystal.density_percent_sol')
        matthews = blk.find_value('_exptl_crystal.density_Matthews')
        resolution = blk.find_value('_refine.ls_d_res_high')
        raw_date = blk.find_values(deposition_tag).str(0)
        deposited = parse_date(raw_date)
        deposit_group = blk.find_value('_pdbx_deposit_group.group_id')
        tsv.writerow([entry_code, nucleic_count, solvent, matthews,
                      resolution, deposited, deposit_group])
def gather_data():
    """Read mmCIF files and write down a few numbers (one file -> one line).

    Writes tab-separated rows to stdout: first a header row, then one row
    for every path yielded by ``util.get_file_paths_from_args()``.

    Columns:
        code      -- ``_entry.id`` read with ``find_string``
        na_chains -- number of ``_entity_poly.type`` values that contain
                     the substring 'nucleotide'
        vs        -- ``_exptl_crystal.density_percent_sol`` (as returned
                     by ``find_value``)
        vm        -- ``_exptl_crystal.density_Matthews`` (ditto)
        d_min     -- ``_refine.ls_d_res_high`` (ditto)
        date      -- the smallest ``parse_date`` result over the
                     ``_database_PDB_rev.date_original`` values
        group     -- ``_pdbx_deposit_group.group_id`` read with
                     ``find_string``

    Raises:
        ValueError: from ``min()`` when a file has no usable
            ``_database_PDB_rev.date_original`` value.
    """
    tsv = csv.writer(sys.stdout, dialect='excel-tab')
    header = ['code', 'na_chains', 'vs', 'vm', 'd_min', 'date', 'group']
    tsv.writerow(header)
    for cif_path in util.get_file_paths_from_args():
        # Each input file is expected to hold exactly one data block.
        blk = cif.read_any(cif_path).sole_block()
        entry_code = blk.find_string('_entry.id')
        # Count the polymer entities whose type mentions 'nucleotide'.
        nucleic_count = 0
        for row in blk.find('_entity_poly.type'):
            if 'nucleotide' in row[0]:
                nucleic_count += 1
        solvent = blk.find_value('_exptl_crystal.density_percent_sol')
        matthews = blk.find_value('_exptl_crystal.density_Matthews')
        resolution = blk.find_value('_refine.ls_d_res_high')
        revision_rows = blk.find('_database_PDB_rev.date_original')
        # Substring test against '?.': skips the '?' and '.' placeholders.
        revision_dates = [parse_date(row[0])
                          for row in revision_rows
                          if row[0] not in '?.']
        first_date = min(revision_dates)
        deposit_group = blk.find_string('_pdbx_deposit_group.group_id')
        tsv.writerow([entry_code, nucleic_count, solvent, matthews,
                      resolution, first_date, deposit_group])
from __future__ import print_function
from gemmi import cif
from util import get_file_paths_from_args

# Report which columns the _atom_site category has, and in what order,
# for each input file; then print how many files share each column order.

# The _atom_site.*_esd columns were removed in mmCIF v5.
# Use ESD = '' instead to compare against files written without them.
ESD = 'Cartn_x_esd Cartn_y_esd Cartn_z_esd occupancy_esd B_iso_or_equiv_esd '

# Expected column names, space-separated, without the "_atom_site." prefix.
USUAL_ORDER = (
    'group_PDB id type_symbol label_atom_id label_alt_id label_comp_id '
    'label_asym_id label_entity_id label_seq_id pdbx_PDB_ins_code '
    'Cartn_x Cartn_y Cartn_z occupancy B_iso_or_equiv '
    + ESD +
    'pdbx_formal_charge auth_seq_id auth_comp_id auth_asym_id '
    'auth_atom_id pdbx_PDB_model_num'
)

# Maps a space-separated column order to the number of files that use it.
counts = {}
for cif_path in get_file_paths_from_args():
    data_block = cif.read(cif_path).sole_block()
    full_tags = data_block.find_loop("_atom_site.id").get_loop().tags
    for tag in full_tags:
        assert tag.startswith("_atom_site.")
    # Drop the category prefix so only the item names are compared.
    column_order = ' '.join(tag[len("_atom_site."):] for tag in full_tags)
    if column_order != USUAL_ORDER:
        # Show the unusual order next to the expected one, then name the
        # block it came from.
        print(column_order)
        print(USUAL_ORDER)
        print(data_block.name, column_order)
    counts[column_order] = 1 + counts.get(column_order, 0)

for order, n_files in counts.items():
    print(n_files, order)

# Results: in v4 a few EM structures (5A9Z 5AA0 5FKI 4UDF)
# had a different order, with ATOM/HETATM in the middle.
def main():
    """Run both formula-weight checks on every input file.

    For each path yielded by ``util.get_file_paths_from_args()`` the file
    is read, its sole data block is taken, and that block is passed first
    to ``check_chem_comp_formula_weight`` and then to
    ``check_entity_formula_weight``.
    """
    for cif_path in util.get_file_paths_from_args():
        document = cif.read(cif_path)
        data_block = document.sole_block()
        check_chem_comp_formula_weight(data_block)
        check_entity_formula_weight(data_block)