def delete(self, request, key):
    db = database.get_or_create(self.media)
    if key not in db:
        return Response(status.HTTP_404_NOT_FOUND)
    del db[key]
    return True
def post(self, request):
    db = database.get_or_create(self.media)
    for m in self.CONTENT:
        # sha256 operates on bytes, so the serialised content is encoded first
        ID = hashlib.sha256(json.dumps(m).encode('utf-8')).hexdigest()
        if ID not in db:
            db[ID] = m
    return Response(status.HTTP_201_CREATED)
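# Note on the post() handler above: json.dumps() is sensitive to dict key
# order, so logically identical payloads can hash to different IDs. A minimal
# sketch of a deterministic variant (sort_keys and compact separators are the
# only changes; 'content_id' is a hypothetical helper name, not part of the
# original code):

import hashlib
import json

def content_id(payload):
    """Return a stable sha256 hex digest for a JSON-serialisable payload."""
    canonical = json.dumps(payload, sort_keys=True, separators=(',', ':'))
    return hashlib.sha256(canonical.encode('utf-8')).hexdigest()

# content_id({'a': 1, 'b': 2}) == content_id({'b': 2, 'a': 1})  -> True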
def set(self, name):
    value = request.form['value']
    goto = request.form.get(
        'goto',
        url_for('ContentManagementSystem:settings')
    )
    setting, is_created = get_or_create(Setting, name=name)
    if is_created:
        db.session.add(setting)
    setting.value = value
    db.session.commit()
    return redirect(goto)
def save_settings(self):
    goto = request.form.get('goto', url_for('ContentManagementSystem:settings'))
    for name, value in request.form.items():
        if name.startswith('setting['):
            # strip the 'setting[' prefix and the trailing ']'
            name = name[8:-1]
            setting, is_created = get_or_create(Setting, name=name)
            if is_created:
                db.session.add(setting)
            setting.value = value
    db.session.commit()
    return redirect(goto)
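# The settings handlers above (and most SQLAlchemy snippets below) rely on a
# get_or_create(Model, **kwargs) helper returning an (instance, created) pair.
# Its definition is not part of this section; a minimal sketch of the usual
# Flask-SQLAlchemy idiom, consistent with how the callers use it (note it does
# NOT add new instances to the session - the callers above do that):

def get_or_create(model, **kwargs):
    """Return (instance, created) for the first row matching kwargs."""
    instance = model.query.filter_by(**kwargs).first()
    if instance:
        return instance, False
    return model(**kwargs), True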
def add_references_by_uniprot(data):
    full_uniprot, ref_type, value = data
    if '-' in full_uniprot:
        uniprot, isoform = full_uniprot.split('-')
        uniprot_tied_references = references.get(uniprot, None)
        if not uniprot_tied_references:
            return
        relevant_references = []
        # select relevant references:
        for reference in uniprot_tied_references:
            if any(entry.isoform == int(isoform) for entry in reference.uniprot_entries):
                relevant_references.append(reference)
    else:
        uniprot_tied_references = references.get(full_uniprot, None)
        if not uniprot_tied_references:
            return
        relevant_references = uniprot_tied_references

    if ref_type == 'UniProtKB-ID':
        # http://www.uniprot.org/help/entry_name
        # "Each >reviewed< entry is assigned a unique entry name upon integration into UniProtKB/Swiss-Prot"
        # Entry names come in the format: X_Y;
        # - for a Swiss-Prot entry, X is a mnemonic protein identification code (at most 5 characters)
        # - for a TrEMBL entry, X is the same as the accession code (6 to 10 characters)
        x, y = value.split('_')
        if len(x) <= 5:
            for reference in relevant_references:
                assert '-' not in full_uniprot
                entry = UniprotEntry.query.filter_by(
                    accession=full_uniprot,
                    reference=reference
                ).one()
                entry.reviewed = True
        return

    if ref_type in ensembl_references_to_collect:
        attr = ensembl_references_to_collect[ref_type]
        for relevant_reference in relevant_references:
            attrs = {'reference': relevant_reference, attr: value}
            peptide, new = get_or_create(EnsemblPeptide, **attrs)
            if new:
                db.session.add(peptide)
def add_uniprot_accession(data):
    # full uniprot includes isoform (if relevant)
    full_uniprot, ref_type, value = data
    if ref_type == 'RefSeq_NT':
        # get protein
        refseq_nm = value.split('.')[0]
        if not refseq_nm or not refseq_nm.startswith('NM') or not full_uniprot:
            return
        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            return
        try:
            uniprot, isoform = full_uniprot.split('-')
            isoform = int(isoform)
        except ValueError:
            # only one isoform?
            # print('No isoform specified for', full_uniprot, refseq_nm)
            uniprot = full_uniprot
            isoform = 1
        reference, new = get_or_create(ProteinReferences, protein=protein)
        uniprot_entry, new_uniprot = get_or_create(UniprotEntry, accession=uniprot, isoform=isoform)
        reference.uniprot_entries.append(uniprot_entry)
        references[uniprot].append(reference)
        if new:
            db.session.add(reference)
        if new_uniprot:
            db.session.add(uniprot_entry)
def __init__(self):
    print(f'Preparing {self.source_name} sites importer...')
    self.issues_counter = Counter()
    # caching proteins and kinases allows for much faster
    # import later on, though it takes some time to cache
    self.known_kinases = create_key_model_dict(Kinase, 'name', lowercase=True)
    self.known_groups = create_key_model_dict(KinaseGroup, 'name', lowercase=True)
    self.known_sites = create_key_model_dict(
        Site, ['protein_id', 'position', 'residue'],
        options=(
            joinedload(Site.sources).joinedload('*')
        )
    )
    self.proteins = create_key_model_dict(
        Protein, 'refseq',
        options=(
            load_only('refseq', 'sequence', 'id')
            .joinedload(Protein.gene)
            .joinedload(Gene.isoforms)
            .load_only('refseq')
        )
    )

    # create site types
    site_type_objects = [
        get_or_create(SiteType, name=name)
        for name in set(self.site_types)
    ]
    self.novel_site_types = [
        site_type for site_type, new in site_type_objects if new
    ]
    self.site_types_map = {
        site_type.name: site_type
        for site_type, new in site_type_objects
    }

    self.source, _ = get_or_create(SiteSource, name=self.source_name)

    print(f'{self.source_name} importer ready.')
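# create_key_model_dict() above is project-specific and not defined in this
# section. A plausible minimal sketch, labeled as an assumption, consistent
# with how it is called (single attribute or list of attributes as the key,
# optional lowercasing, optional SQLAlchemy query options):

def create_key_model_dict(model, key, lowercase=False, options=None):
    """Map key attribute value(s) -> model instance, for fast in-memory lookups."""
    query = model.query
    if options is not None:
        query = query.options(options)
    keys = key if isinstance(key, list) else [key]

    def make_key(instance):
        values = tuple(getattr(instance, k) for k in keys)
        value = values if len(values) > 1 else values[0]
        return value.lower() if lowercase and isinstance(value, str) else value

    return {make_key(instance): instance for instance in query}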
def parser(line):
    gene_name, p_value, fdr = line
    p_value = float(p_value)
    fdr = float(fdr)

    nonlocal to_high_fdr_count
    if fdr >= fdr_cutoff:
        to_high_fdr_count += 1
        return

    gene, created = get_or_create(Gene, name=gene_name)
    entry = GeneListEntry(gene=gene, p=p_value, fdr=fdr)
    list_entries.append(entry)
def get_genomic_muts(self, chrom, dna_pos, dna_ref, dna_alt) -> List['SearchResult']:
    """Returns aminoacid mutations meeting provided criteria.

    There may be several mutations with the same genomic coordinates and
    alleles, as there are many splicing isoforms produced from a single gene.

    Stop codon mutations are not considered.

    Args:
        chrom: chromosome number or identifier, without 'chr' prefix
        dna_pos: genomic position
        dna_ref: reference allele
        dna_alt: alternative allele

    Returns:
        list of items where each item contains Mutation object and additional metadata
    """
    from search.mutation_result import SearchResult
    from models import Protein, Mutation
    from database import get_or_create

    snv = make_snv_key(chrom, dna_pos, dna_ref, dna_alt)

    items = [decode_csv(item) for item in self[snv]]

    # this could be sped up by: itemgetters, accumulative queries and so on
    results = []
    for item in items:
        protein = Protein.query.get(item['protein_id'])
        mutation, created = get_or_create(
            Mutation,
            protein=protein,
            protein_id=protein.id,  # TODO: should use either protein or protein_id
            position=item['pos'],
            alt=item['alt']
        )
        results.append(
            SearchResult(
                protein=protein,
                mutation=mutation,
                is_mutation_novel=created,
                type='genomic',
                **item
            )
        )
    return results
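# A sketch of calling get_genomic_muts(): the coordinates and alleles below
# are made up for illustration, 'mappings' is a hypothetical instance of the
# owning class, and this assumes SearchResult exposes its constructor
# arguments as attributes:

results = mappings.get_genomic_muts(chrom='17', dna_pos=7577121, dna_ref='C', dna_alt='T')
for result in results:
    print(result.protein.refseq, result.mutation.position, result.mutation.alt)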
def __setstate__(self, state):
    state['protein'] = Protein.query.filter_by(
        refseq=state['protein_refseq']
    ).one()
    del state['protein_refseq']

    state['mutation'], created = get_or_create(
        Mutation,
        protein=state['protein'],
        **state['mutation_kwargs']
    )
    del state['mutation_kwargs']

    state['meta_user'].mutation = state['mutation']
    state['mutation'].meta_user = state['meta_user']

    self.__dict__.update(state)
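# __setstate__ above consumes 'protein_refseq' and 'mutation_kwargs' keys, so
# the pickling counterpart presumably replaces live ORM objects with plain
# identifiers before serialisation. A hypothetical reconstruction of a
# matching __getstate__, assuming the mutation is identified by its position
# and alt allele:

def __getstate__(self):
    state = self.__dict__.copy()
    state['protein_refseq'] = state.pop('protein').refseq
    mutation = state.pop('mutation')
    state['mutation_kwargs'] = {
        'position': mutation.position,
        'alt': mutation.alt,
    }
    return state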
def add_ncbi_mappings(data):
    # 9606    3329    HSPD1    NG_008915.1    NM_199440.1    NP_955472.1    reference standard
    taxonomy, entrez_id, gene_name, refseq_gene, lrg, refseq_nucleotide, t, refseq_peptide, p, category = data

    refseq_nm = refseq_nucleotide.split('.')[0]
    if not refseq_nm or not refseq_nm.startswith('NM'):
        return
    try:
        protein = Protein.query.filter_by(refseq=refseq_nm).one()
    except NoResultFound:
        return

    reference, new = get_or_create(ProteinReferences, protein=protein)
    if new:
        db.session.add(reference)

    reference.refseq_np = refseq_peptide.split('.')[0]
    reference.refseq_ng = refseq_gene.split('.')[0]

    gene = protein.gene
    if gene.name != gene_name:
        print(f'Gene name mismatch for RefSeq mappings: {gene.name} vs {gene_name}')

    entrez_id = int(entrez_id)
    if gene.entrez_id:
        if gene.entrez_id != entrez_id:
            print(f'Entrez ID mismatch for isoforms of {gene.name} gene: {gene.entrez_id}, {entrez_id}')
            if gene.name == gene_name:
                print(
                    f'Overwriting {gene.entrez_id} entrez id with {entrez_id} for {gene.name} gene, '
                    f'because record with {entrez_id} has matching gene name'
                )
                gene.entrez_id = entrez_id
    else:
        gene.entrez_id = entrez_id
def save_text_entry(self):
    name = request.form['entry_id']
    new_content = request.form['new_content']

    text_entry, created = get_or_create(TextEntry, name=name)
    if created:
        db.session.add(text_entry)

    status = 200
    text_entry.content = new_content
    try:
        db.session.commit()
    except (IntegrityError, OperationalError) as e:
        print(e)
        db.session.rollback()
        status = 501

    result = {
        'status': status,
        'content': substitute_variables(text_entry.content)
    }
    return jsonify(result)
def save_inline_help(self):
    name = request.form['entry_id']
    old_content = request.form.get('old_content', None)
    new_content = request.form['new_content']

    help_entry, created = get_or_create(HelpEntry, name=name)
    if created:
        db.session.add(help_entry)

    if created or help_entry.content == old_content:
        status = 200
        help_entry.content = new_content
        try:
            db.session.commit()
        except (IntegrityError, OperationalError) as e:
            print(e)
            db.session.rollback()
            status = 501
    else:
        status = 409

    result = {'status': status, 'content': help_entry.content}
    return jsonify(result)
def parse(self, path):
    mutations = defaultdict(lambda: [0, set()])

    for line in iterate_tsv_gz_file(path, file_header=self.header):
        cancer_name, sample_name = self.decode_line(line)

        if sample_name in self.samples_to_skip:
            continue

        cancer, created = get_or_create(Cancer, name=cancer_name)
        if created:
            db.session.add(cancer)

        for mutation_id in self.preparse_mutations(line):
            key = (mutation_id, cancer.id)
            mutations[key][0] += 1
            mutations[key][1].add(sample_name)

    return mutations
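# parse() above returns {(mutation_id, cancer_id): [count, samples]}. A sketch
# of consuming that structure downstream ('importer' and the path are
# hypothetical; the unpacking mirrors the defaultdict layout exactly):

mutations = importer.parse('data/mutations.tsv.gz')
for (mutation_id, cancer_id), (count, samples) in mutations.items():
    print(mutation_id, cancer_id, count, len(samples))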
def sites_motifs(data=None):
    motifs_data = [
        # site_type_name, name, pattern (Python regular expression), sequences for pseudo logo
        # https://prosite.expasy.org/PDOC00001
        [
            'N-glycosylation', 'N-linked', '.{7}N[^P][ST].{5}',
            [
                ' ' * 7 + f'N{aa}{st}' + ' ' * 5
                for aa in aa_symbols if aa != 'P'
                for st in 'ST'
            ]
        ],
        # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4721579/
        [
            'N-glycosylation', 'N-linked - atypical', '.{7}N[^P][CV].{5}',
            [
                ' ' * 7 + f'N{aa}{cv}' + ' ' * 5
                for aa in aa_symbols if aa != 'P'
                for cv in 'CV'
            ]
        ],
        # Based on https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1301293/
        [
            'O-glycosylation', 'O-linked TAPP', '.{7}TAPP',
            [' ' * 7 + 'TAPP' + ' ' * 5]
        ],
        [
            'O-glycosylation', 'O-linked TSAP', '.{7}TSAP',
            [' ' * 7 + 'TSAP' + ' ' * 5]
        ],
        [
            'O-glycosylation', 'O-linked TV.P', '.{7}TV.P',
            [' ' * 7 + 'TV.P' + ' ' * 5]
        ],
        [
            'O-glycosylation', 'O-linked [ST]P.P', '.{7}[ST]P.P',
            [' ' * 7 + f'{st}P P' + ' ' * 5 for st in 'ST']
        ],
        # https://www.uniprot.org/help/carbohyd
        [
            'C-glycosylation', 'C-linked W..W', '.{7}W..W.{4}',
            [' ' * 7 + 'W  W' + ' ' * 4]
        ],
        [
            'C-glycosylation', 'C-linked W..W', '.{4}W..W.{7}',
            [' ' * 4 + 'W  W' + ' ' * 7]
        ],
        [
            'C-glycosylation', 'C-linked W[ST].C', '.{7}W[ST].C.{4}',
            [' ' * 7 + f'W{st} C' + ' ' * 4 for st in 'ST']
        ],
    ]

    if data:
        motifs_data = data

    new_motifs = []
    for site_type_name, name, pattern, sequences in motifs_data:
        site_type, _ = get_or_create(SiteType, name=site_type_name)
        motif, new = get_or_create(SiteMotif, name=name, pattern=pattern, site_type=site_type)
        if new:
            new_motifs.append(motif)
            db.session.add(motif)
        motif.generate_pseudo_logo(sequences)

    return new_motifs
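# The motif patterns above describe 15-residue windows centred on the modified
# residue (seven flanking positions on each side for most patterns). A quick
# sanity check of the N-linked pattern with the standard `re` module, using a
# made-up sequence window:

import re

window = 'AAAAAAANVSAAAAA'  # hypothetical 15-residue window, N at position 8
assert re.fullmatch('.{7}N[^P][ST].{5}', window)
assert not re.fullmatch('.{7}N[^P][ST].{5}', 'AAAAAAANPSAAAAA')  # proline excluded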
def clinvar_parser(line):
    nonlocal highest_disease_id, duplicates

    try:
        (
            at_least_one_significant_sub_entry, values, sub_entries_cnt,
            names, statuses, significances
        ) = self.parse_metadata(line)
    except MalformedRowError as e:
        print(str(e) + '\n' + line)
        return False

    # following 2 lines are a result of issue #47 - we don't import those
    # clinvar mutations that do not have any diseases specified:
    if not at_least_one_significant_sub_entry:
        return

    for mutation_id in self.get_or_make_mutations(line):

        # take care of duplicates
        duplicated = self.look_after_duplicates(mutation_id, clinvar_mutations, values[:4])
        if duplicated:
            duplicates += 1
            continue

        # take care of nearly-duplicates
        same_mutation_pointers = self.mutations_details_pointers_grouped_by_unique_mutations[mutation_id]
        assert len(same_mutation_pointers) <= 1
        if same_mutation_pointers:
            pointer = same_mutation_pointers[0]
            old = self.data_as_dict(clinvar_mutations[pointer])
            new = self.data_as_dict(values, mutation_id=mutation_id)
            if old['db_snp_ids'] != [new['db_snp_ids']]:
                clinvar_mutations[pointer][1].append(new['db_snp_ids'])

            # if either of the dbSNP entries is validated, the mutation is validated
            # (the same with presence in PubMed)
            for key in ['is_validated', 'is_in_pubmed_central']:
                if old[key] != new[key] and new[key]:
                    index = self.insert_keys.index(key)
                    clinvar_mutations[pointer][index] = True

            print(
                'Merged details referring to the same mutation (%s): %s into %s'
                % (mutation_id, values, clinvar_mutations[pointer])
            )
            continue

        self.protect_from_duplicates(mutation_id, clinvar_mutations)

        # Python 3.5 makes it easy: **values (but is not available)
        clinvar_mutations.append([
            mutation_id,
            [values[0]],
            values[1],
            values[2],
            values[3],
        ])

        for i in range(sub_entries_cnt):
            name = names[i]

            # we don't want _uninteresting_ data
            if name in ('not_specified', 'not provided'):
                continue

            if name in new_diseases:
                disease_id = new_diseases[name]
            else:
                disease, created = get_or_create(Disease, name=name)
                if created:
                    highest_disease_id += 1
                    new_diseases[name] = highest_disease_id
                    disease_id = highest_disease_id
                else:
                    disease_id = disease.id

            clinvar_data.append((
                len(clinvar_mutations),
                int(significances[i]) if significances is not None else None,
                disease_id,
                statuses[i] if statuses else None,
            ))
def external_references(path='data/HUMAN_9606_idmapping.dat.gz',
                        refseq_lrg='data/LRG_RefSeqGene',
                        refseq_link='data/refseq_link.tsv.gz'):
    from models import Protein
    from models import ProteinReferences
    from models import EnsemblPeptide
    from sqlalchemy.orm.exc import NoResultFound

    references = defaultdict(list)

    def add_uniprot_accession(data):
        # full uniprot includes isoform (if relevant)
        full_uniprot, ref_type, value = data
        if ref_type == 'RefSeq_NT':
            # get protein
            refseq_nm = value.split('.')[0]
            if not refseq_nm or not refseq_nm.startswith('NM') or not full_uniprot:
                return
            try:
                protein = Protein.query.filter_by(refseq=refseq_nm).one()
            except NoResultFound:
                return
            try:
                uniprot, isoform = full_uniprot.split('-')
                isoform = int(isoform)
            except ValueError:
                # only one isoform?
                # print('No isoform specified for', full_uniprot, refseq_nm)
                uniprot = full_uniprot
                isoform = 1
            reference, new = get_or_create(ProteinReferences, protein=protein)
            uniprot_entry, _ = get_or_create(UniprotEntry, accession=uniprot, isoform=isoform)
            reference.uniprot_entries.append(uniprot_entry)
            references[uniprot].append(reference)
            if new:
                db.session.add(reference)

    ensembl_references_to_collect = {'Ensembl_PRO': 'peptide_id'}

    def add_references_by_uniprot(data):
        full_uniprot, ref_type, value = data
        if '-' in full_uniprot:
            uniprot, isoform = full_uniprot.split('-')
            uniprot_tied_references = references.get(uniprot, None)
            if not uniprot_tied_references:
                return
            relevant_references = []
            # select relevant references:
            for reference in uniprot_tied_references:
                if any(entry.isoform == int(isoform) for entry in reference.uniprot_entries):
                    relevant_references.append(reference)
        else:
            uniprot_tied_references = references.get(full_uniprot, None)
            if not uniprot_tied_references:
                return
            relevant_references = uniprot_tied_references

        if ref_type == 'UniProtKB-ID':
            # http://www.uniprot.org/help/entry_name
            # "Each >reviewed< entry is assigned a unique entry name upon integration into UniProtKB/Swiss-Prot"
            # Entry names come in the format: X_Y;
            # - for a Swiss-Prot entry, X is a mnemonic protein identification code (at most 5 characters)
            # - for a TrEMBL entry, X is the same as the accession code (6 to 10 characters)
            x, y = value.split('_')
            if len(x) <= 5:
                for reference in relevant_references:
                    assert '-' not in full_uniprot
                    entry = UniprotEntry.query.filter_by(
                        accession=full_uniprot,
                        reference=reference
                    ).one()
                    entry.reviewed = True
            return

        if ref_type in ensembl_references_to_collect:
            attr = ensembl_references_to_collect[ref_type]
            for relevant_reference in relevant_references:
                attrs = {'reference': relevant_reference, attr: value}
                peptide, new = get_or_create(EnsemblPeptide, **attrs)
                if new:
                    db.session.add(peptide)

    def add_ncbi_mappings(data):
        # 9606    3329    HSPD1    NG_008915.1    NM_199440.1    NP_955472.1    reference standard
        taxonomy, entrez_id, gene_name, refseq_gene, lrg, refseq_nucleotide, t, refseq_peptide, p, category = data

        refseq_nm = refseq_nucleotide.split('.')[0]
        if not refseq_nm or not refseq_nm.startswith('NM'):
            return
        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            return

        reference, new = get_or_create(ProteinReferences, protein=protein)
        if new:
            db.session.add(reference)

        reference.refseq_np = refseq_peptide.split('.')[0]
        reference.refseq_ng = refseq_gene.split('.')[0]

        gene = protein.gene
        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' % (gene.name, gene_name))

        entrez_id = int(entrez_id)
        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' % (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id)
                    )
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

    parse_tsv_file(
        refseq_lrg,
        add_ncbi_mappings,
        file_header=[
            '#tax_id', 'GeneID', 'Symbol', 'RSG', 'LRG', 'RNA', 't',
            'Protein', 'p', 'Category'
        ]
    )

    # add mappings retrieved from UCSC tables for completeness
    header = [
        '#name', 'product', 'mrnaAcc', 'protAcc', 'geneName', 'prodName',
        'locusLinkId', 'omimId'
    ]
    for line in iterate_tsv_gz_file(refseq_link, header):
        gene_name, protein_full_name, refseq_nm, refseq_peptide, _, _, entrez_id, omim_id = line

        if not refseq_nm or not refseq_nm.startswith('NM'):
            continue

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            continue

        gene = protein.gene

        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' % (gene.name, gene_name))

        entrez_id = int(entrez_id)

        if protein_full_name:
            if protein.full_name:
                if protein.full_name != protein_full_name:
                    print(
                        'Protein full name mismatch: %s vs %s for %s'
                        % (protein.full_name, protein_full_name, protein.refseq)
                    )
                continue
            protein.full_name = protein_full_name

        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' % (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id)
                    )
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

        if refseq_peptide:
            reference, new = get_or_create(ProteinReferences, protein=protein)

            if new:
                db.session.add(reference)

            if reference.refseq_np and reference.refseq_np != refseq_peptide:
                print(
                    'Refseq peptide mismatch between LRG and UCSC retrieved data: %s vs %s for %s'
                    % (reference.refseq_np, refseq_peptide, protein.refseq)
                )

            reference.refseq_np = refseq_peptide

    parse_tsv_file(path, add_uniprot_accession, file_opener=gzip.open, mode='rt')
    parse_tsv_file(path, add_references_by_uniprot, file_opener=gzip.open, mode='rt')

    return [
        reference
        for reference_group in references.values()
        for reference in reference_group
    ]
def insert_data(db):
    # CSV columns:
    # "Board Code (200 identifies HR within the department)","License Type Code (see tables below)",
    # "Licensee Name","Rank Code","Modifier Code (see tables below)",
    # "Mailing Name (if different from Licensee Name)","Mailing Street Address Line 1",
    # "Mailing Address Line 2","Mailing Address Line 3","Mailing City","Mailing State",
    # "Mailing Zip Code","Mailing County Code (see table below)","Primary Telephone Number",
    # "Business Name (Location)",Filler,"Location Street Address Line 1","Location Address Line 2",
    # "Location Address Line 3","Location City","Location State","Location Zip Code",
    # "Location County Code (see table below)","Secondary Telephone Number",District,Region,
    # "License Number","Primary Status Code (see table below)","Secondary Status Code (see table below)",
    # "Expiry Date","Last Inspection Date","Base Risk","Secondary Risk",
    # Latitude,Longitude,"Accuracy Score","Accuracy Type",Number,Street,City,State,County,Zip,Country
    # 20 Primary Status Code == Current
    ad = geo_coords.PostalAddress
    gc = geo_coords.GeoCoords
    org = organization.Organization
    for i in xrange(1, 8):
        print "hrlodge%d.csv" % i
        rd = csv.DictReader(open("hrlodge%d.csv" % i, "r+"))
        for row in rd:
            code = row['Rank Code']
            if code == "HOTL":
                m = hotels.Hotel()
            elif code == "MOTL":
                m = hotels.Motel()
            elif code == 'BNB':
                m = hotels.BedAndBreakfast()
            elif code == "CNDO":
                m = hotels.Resort()
            elif code == "DWEL":
                m = hotels.Resort()
            elif code == "TAPT":
                m = hotels.Resort()
            else:
                continue
            m.name = row['Business Name (Location)'].decode('utf-8').encode('ascii', 'xmlcharrefreplace')
            m.license_number = row['License Number']
            dt = row['Expiry Date'].split("/")
            m.license_expiry = datetime.date(int(dt[2]), int(dt[0]), int(dt[1]))
            dt = row['Last Inspection Date'].split("/")
            if len(dt) == 3:
                m.last_inspection = datetime.date(int(dt[2]), int(dt[0]), int(dt[1]))
            m.status_code = row['Primary Status Code (see table below)']
            m.rooms = row['Number of Seats (food service) or Rental Units (lodging)']
            m.telephone = row['Primary Telephone Number']
            m.address, isFound = get_or_create(
                db.session, ad,
                streetAddress=row['Location Street Address Line 1'],
                streetAddress2=row['Location Address Line 2'],
                streetAddress3=row['Location Address Line 3'],
                addressLocality=row['Location City'],
                addressRegion=row['Location State'],
                postalCode=row['Location Zip Code']
            )
            # Latitude,Longitude,"Accuracy Score","Accuracy Type",Number,Street,City,State,County,Zip,Country
            # XXX limit search to name and state
            morg, isFound = get_or_create(
                db.session, org,
                name=row['Licensee Name'].decode('utf-8').encode('ascii', 'xmlcharrefreplace')
            )
            if not isFound:
                morg.address, isFound = get_or_create(
                    db.session, ad,
                    streetAddress=row['Mailing Street Address Line 1'],
                    streetAddress2=row['Mailing Address Line 2'],
                    streetAddress3=row['Mailing Address Line 3'],
                    addressLocality=row['Mailing City'],
                    addressRegion=row['Mailing State'],
                    postalCode=row['Mailing Zip Code']
                )
                #org.telephone =
            m.organization = morg
            db.session.add(m)
            db.session.add(morg)
        db.session.commit()
    db.session.commit()
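# Note that insert_data() above (a Python 2 script) calls get_or_create with a
# different signature than the Flask-SQLAlchemy snippets: the session is
# passed explicitly and the second return value reads as isFound (True when
# the row already existed). A minimal sketch of that variant, under the same
# assumptions:

def get_or_create(session, model, **kwargs):
    """Return (instance, found) for the first row matching kwargs."""
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return instance, True
    return model(**kwargs), False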
def clinvar_parser(line):
    nonlocal highest_disease_id, duplicates

    metadata = line[20].split(';')

    clinvar_entry = make_metadata_ordered_dict(clinvar_keys, metadata)

    names, statuses, significances = (
        (entry.replace('|', ',').split(',') if entry else None)
        for entry in (
            clinvar_entry[key]
            for key in ('CLNDBN', 'CLNREVSTAT', 'CLNSIG')
        )
    )

    # these lengths should always be equal if they exist
    sub_entries_cnt = max(
        [
            len(x)
            for x in (names, statuses, significances)
            if x
        ] or [0]
    )

    at_least_one_significant_sub_entry = False

    for i in range(sub_entries_cnt):
        try:
            if names:
                if names[i] not in ('not_specified', 'not provided'):
                    names[i] = self._beautify_disease_name(names[i])
                    at_least_one_significant_sub_entry = True
            if statuses and statuses[i] == 'no_criteria':
                statuses[i] = None
        except IndexError:
            print('Malformed row (wrong count of subentries) on %s-th entry:' % i)
            print(line)
            return False

    values = list(clinvar_entry.values())

    # following 2 lines are a result of issue #47 - we don't import those
    # clinvar mutations that do not have any diseases specified:
    if not at_least_one_significant_sub_entry:
        return

    for mutation_id in self.preparse_mutations(line):

        # take care of duplicates
        duplicated = self.look_after_duplicates(mutation_id, clinvar_mutations, values[:4])
        if duplicated:
            duplicates += 1
            continue

        # take care of nearly-duplicates
        same_mutation_pointers = self.mutations_details_pointers_grouped_by_unique_mutations[mutation_id]
        assert len(same_mutation_pointers) <= 1
        if same_mutation_pointers:
            pointer = same_mutation_pointers[0]
            old = self.data_as_dict(clinvar_mutations[pointer])
            new = self.data_as_dict(values, mutation_id=mutation_id)
            if old['db_snp_ids'] != [new['db_snp_ids']]:
                clinvar_mutations[pointer][1].append(new['db_snp_ids'])

            # if either of the dbSNP entries is validated, the mutation is validated
            # (the same with presence in PubMed)
            for key in ['is_validated', 'is_in_pubmed_central']:
                if old[key] != new[key] and new[key]:
                    index = self.insert_keys.index(key)
                    clinvar_mutations[pointer][index] = True

            print(
                'Merged details referring to the same mutation (%s): %s into %s'
                % (mutation_id, values, clinvar_mutations[pointer])
            )
            continue

        self.protect_from_duplicates(mutation_id, clinvar_mutations)

        # Python 3.5 makes it easy: **values (but is not available)
        clinvar_mutations.append(
            [
                mutation_id,
                [values[0]],
                values[1],
                values[2],
                values[3],
            ]
        )

        for i in range(sub_entries_cnt):
            name = names[i]

            # we don't want _uninteresting_ data
            if name in ('not_specified', 'not provided'):
                continue

            if name in new_diseases:
                disease_id = new_diseases[name]
            else:
                disease, created = get_or_create(Disease, name=name)
                if created:
                    highest_disease_id += 1
                    new_diseases[name] = highest_disease_id
                    disease_id = highest_disease_id
                else:
                    disease_id = disease.id

            clinvar_data.append(
                (
                    len(clinvar_mutations),
                    int(significances[i]) if significances is not None else None,
                    disease_id,
                    statuses[i] if statuses else None,
                )
            )
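# The CLNDBN/CLNREVSTAT/CLNSIG fields parsed above hold parallel sub-entries
# delimited by commas or pipes. A toy illustration of the split logic with a
# made-up CLNDBN value (not real ClinVar data):

entry = 'Breast_cancer|not_specified,Lung_cancer'
names = entry.replace('|', ',').split(',') if entry else None
# names == ['Breast_cancer', 'not_specified', 'Lung_cancer']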