def test_parse_tsv_file(tmpdir):
    """Check header validation and row dispatching of `parsers.parse_tsv_file`.

    A three-column TSV file (header + three data rows) is written to a
    temporary directory. The test then verifies that:

    - `parsers.ParsingError` is raised when the expected header does not
      match the header found in the file,
    - with the correct header, the callback receives the data rows in file
      order and is invoked once per data row.

    Args:
        tmpdir: temporary directory fixture providing `join()`; the object
            returned by `join()` must support `write()` and `str()`.
    """
    header = ['gene', 'id', 'some column with spaces']
    rows = (
        ('XYZ', '1', 'some description'),
        ('WQT', '2', 'some description'),
        ('BCZ', '3', 'some description'),
    )

    # Columns are joined with an explicit '\t' escape: literal tab characters
    # are easily turned into spaces by editors, which would make the file a
    # single-column one and the header impossible to match.
    some_tsv_text_with_header = ['\t'.join(header)]
    some_tsv_text_with_header.extend('\t'.join(row) for row in rows)

    temp_file = tmpdir.join('some_tsv_file.tsv')
    temp_file.write('\n'.join(some_tsv_text_with_header))

    file_name = str(temp_file)

    # both a header with a wrong column name and a too short header
    # have to be rejected
    wrong_headers = (['gene', 'id', 'some'], ['gene', 'id'])

    for wrong_header in wrong_headers:
        with pytest.raises(parsers.ParsingError):
            parsers.parse_tsv_file(
                file_name, lambda x: x, file_header=wrong_header
            )

    # test case 1: check if import goes well with the test data
    counter = 0

    def parse(data):
        nonlocal counter
        counter += 1
        # ids in the second column are consecutive, starting from 1
        assert counter == int(data[1])
        assert data[2] == 'some description'

    parsers.parse_tsv_file(file_name, parse, file_header=header)

    # without this check the test would pass even if
    # the callback was never invoked
    assert counter == len(rows)
def load_sites(self, path='data/site_table.tsv'):
    """Load sites from given file, altogether with their kinases.

    Each row of the file describes a single site; the `type` column may
    hold several comma-separated modification types and `self.add_site`
    is called once for every one of them, always with the same kinase
    names and PubMed ids.

    Args:
        path: to tab-separated-values file with sites to load

    Returns:
        list of sites reported as new by `self.add_site`
    """
    header = ['gene', 'position', 'residue', 'enzymes', 'pmid', 'type']

    sites = []

    def parser(line):
        refseq, position, residue, kinases_str, pmids, mod_types = line

        # The names have to be materialised into a list: a lazy `filter`
        # object would be exhausted by the first `add_site` call, leaving
        # every following modification type of this row without kinases.
        site_kinase_names = [name for name in kinases_str.split(',') if name]
        pmids = pmids.split(',')
        position = int(position)

        for mod_type in mod_types.split(','):
            site, is_new = self.add_site(
                refseq, position, residue, mod_type,
                pmids, site_kinase_names
            )
            if is_new:
                sites.append(site)

    parse_tsv_file(path, parser, header)

    return sites
def pathways(path='data/hsapiens.pathways.NAME.gmt'):
    """Loads pathways from given '.gmt' file.

    New genes may be created and should automatically be added
    to the session with pathways as those have a relationship.

    Args:
        path: path to the GMT file; each line is expected to contain
            a gene set identifier, a description and gene names

    Returns:
        list of created pathways

    Raises:
        Exception: if a gene set identifier starts with
            neither 'GO' nor 'REAC'
    """
    known_genes = create_key_model_dict(Gene, 'name', lowercase=True)

    pathways = []
    new_genes = []

    def parser(data):
        """Parse GMT file with pathway descriptions.

        Args:
            data: a list of subsequent columns from a single line of GMT file

                For example::

                    ['CORUM:5419', 'HTR1A-GPR26 complex', 'GPR26', 'HTR1A']
        """
        gene_set_name = data[0]
        # Entry description can be empty
        entry_description = data[1].strip()

        entry_gene_names = [name.strip() for name in data[2:]]

        pathway_genes = []

        for gene_name in entry_gene_names:
            # genes are matched case-insensitively
            name_lower = gene_name.lower()
            if name_lower in known_genes:
                gene = known_genes[name_lower]
            else:
                gene = Gene(name=gene_name)
                known_genes[name_lower] = gene
                new_genes.append(gene)

            pathway_genes.append(gene)

        pathway = Pathway(description=entry_description, genes=pathway_genes)

        # the numeric identifier follows the 'GO:' / 'REAC:' prefix
        if gene_set_name.startswith('GO'):
            pathway.gene_ontology = int(gene_set_name[3:])
        elif gene_set_name.startswith('REAC'):
            pathway.reactome = int(gene_set_name[5:])
        else:
            raise Exception('Unknown gene set name: "%s"' % gene_set_name)

        # without collecting the pathway here the function
        # would always return an empty list
        pathways.append(pathway)

    parse_tsv_file(path, parser)

    print(len(new_genes), 'new genes created')

    return pathways
def active_driver_gene_lists(
    lists=(
        ListData(
            name='Cancer (TCGA PanCancerAtlas)',
            path='data/mc3.activedriver.2017-11-28.txt',
            mutations_source=MC3Mutation
        ),
        ListData(
            name='Clinical (ClinVar)',
            path='data/ActiveDriver1_result_pvalue_less_0.01_InheritedMutation-2017-02-16.txt',
            mutations_source=InheritedMutation
        )
    ),
    fdr_cutoff=0.01
):
    """Create gene lists from files with `gene`, `p`, `fdr` columns.

    Lists whose names are already used by an existing `GeneList`
    are skipped (a message is printed).

    Args:
        lists: descriptions of lists to load; each needs `name`, `path`
            and `mutations_source` (which may be falsy) attributes
        fdr_cutoff: rows with FDR greater than or equal
            to this value are not imported

    Returns:
        list of created `GeneList` objects
    """
    current_gene_lists = {
        existing_list.name
        for existing_list in GeneList.query.all()
    }
    gene_lists = []

    for list_data in lists:
        if list_data.name in current_gene_lists:
            print(
                'Skipping gene list %s: already present in database'
                % list_data.name
            )
            continue

        gene_list = GeneList(
            name=list_data.name,
            mutation_source_name=(
                list_data.mutations_source.name
                if list_data.mutations_source else
                None
            )
        )

        header = ['gene', 'p', 'fdr']

        # number of rows rejected due to the cutoff (not reported anywhere)
        to_high_fdr_count = 0
        list_entries = []

        def parser(line):
            nonlocal to_high_fdr_count

            gene_name, p_value, fdr = line
            p_value = float(p_value)
            fdr = float(fdr)

            if fdr >= fdr_cutoff:
                to_high_fdr_count += 1
                return

            gene, created = get_or_create(Gene, name=gene_name)

            entry = GeneListEntry(gene=gene, p=p_value, fdr=fdr)
            list_entries.append(entry)

        parse_tsv_file(list_data.path, parser, header)

        # attach the entries once, when the whole file has been read, so
        # that the list receives the complete set of entries regardless of
        # whether the attribute keeps a reference to or a copy of the list
        gene_list.entries = list_entries

        gene_lists.append(gene_list)

    return gene_lists
def cancers(path='data/cancer_types.txt'):
    """Load cancer types from a file with code, name and color columns.

    The code of every cancer found in the file is (re)assigned,
    regardless of whether the cancer was just created or not.

    Args:
        path: path to the file to be read with `parse_tsv_file`

    Returns:
        list of cancers which were newly created
    """
    print('Loading cancer data:')

    created_cancers = []

    def parser(line):
        # the third column is unpacked but not used
        code, name, _color = line

        cancer, is_new = get_or_create(Cancer, name=name)
        cancer.code = code

        if is_new:
            created_cancers.append(cancer)

    parse_tsv_file(path, parser)

    return created_cancers
def kinase_classification(path='data/regphos_kinome_scraped_clean.txt'):
    """Assign kinases to kinase groups, creating missing kinases and groups.

    Kinases and groups are looked up by lowercased names.

    Args:
        path: path to the tab-separated classification file

    Returns:
        list of newly created kinase groups
    """
    known_kinases = create_key_model_dict(Kinase, 'name', True)
    known_groups = create_key_model_dict(KinaseGroup, 'name', True)

    new_groups = []

    print('Loading protein kinase groups:')

    header = [
        'No.', 'Kinase', 'Group', 'Family', 'Subfamily',
        'Gene.Symbol', 'gene.clean', 'Description', 'group.clean'
    ]

    def parser(line):
        # the 'Group' column is not used;
        # note that the subfamily is often absent
        _, family, subfamily = line[2:5]

        # the 'gene.clean' [6] fits better to the names
        # of kinases used in all other data files
        kinase_name = line[6]

        # 'group.clean' is not atomic and is redundant with respect to
        # family and subfamily. This check assures that in case of a change
        # the maintainer would be able to spot the inconsistency easily
        if subfamily:
            expected_clean = family + '_' + subfamily
        else:
            expected_clean = family
        assert line[8] == expected_clean

        kinase_key = kinase_name.lower()
        if kinase_key not in known_kinases:
            known_kinases[kinase_key] = Kinase(
                name=kinase_name,
                protein=get_preferred_gene_isoform(kinase_name)
            )

        # the 'family' corresponds to 'group' in the all other files
        group_key = family.lower()
        if group_key not in known_groups:
            created_group = KinaseGroup(name=family)
            known_groups[group_key] = created_group
            new_groups.append(created_group)

        known_groups[group_key].kinases.append(known_kinases[kinase_key])

    parse_tsv_file(path, parser, header)

    return new_groups
def sites(path='data/site_table.tsv'):
    """Create sites described in given file, together with their kinases.

    Kinases and kinase groups are resolved with `get_or_create_kinases`,
    using name-keyed dictionaries of the already known ones.

    Args:
        path: to tab-separated-values file with sites to load

    Returns:
        list of created sites
    """
    proteins = get_proteins()

    print('Loading protein sites:')

    header = ['gene', 'position', 'residue', 'enzymes', 'pmid', 'type']

    created_sites = []

    known_kinases = create_key_model_dict(Kinase, 'name')
    known_groups = create_key_model_dict(KinaseGroup, 'name')

    def parser(line):
        refseq, position, residue, kinases_str, pmid, mod_type = line

        # lazily skip empty names (e.g. when the column is empty)
        kinase_names = (name for name in kinases_str.split(',') if name)

        kinases, groups = get_or_create_kinases(
            kinase_names, known_kinases, known_groups
        )

        created_sites.append(
            Site(
                position=int(position),
                residue=residue,
                pmid=pmid,
                protein=proteins[refseq],
                kinases=list(kinases),
                kinase_groups=list(groups),
                type=mod_type
            )
        )

    parse_tsv_file(path, parser, header)

    return created_sites
def parse(self, path):
    """Collect mutations with their minor allele frequencies (MAF).

    The MAF values are read from the fifth ';'-separated item of the
    metadata column (index 20), which has to start with 'MAF='. Rows with
    the third frequency (`maf_all`) equal to zero are skipped.

    Args:
        path: path to the file, opened with `gzip_open_text`

    Returns:
        list of `(mutation_id, maf_ea, maf_aa, maf_all)` tuples
    """
    esp_mutations = []
    duplicates = 0
    skipped = 0

    def esp_parser(line):
        nonlocal duplicates, skipped

        # not flexible way to select MAF from metadata, but quite quick
        maf_field = line[20].split(';')[4]
        assert maf_field.startswith('MAF=')

        maf_ea, maf_aa, maf_all = [
            float(frequency)
            for frequency in maf_field[4:].split(',')
        ]

        if maf_all == 0:
            skipped += 1
            return

        for mutation_id in self.preparse_mutations(line):
            values = (mutation_id, maf_ea, maf_aa, maf_all)

            if self.look_after_duplicates(mutation_id, esp_mutations, values):
                duplicates += 1
            else:
                self.protect_from_duplicates(mutation_id, esp_mutations)
                esp_mutations.append(values)

    parse_tsv_file(path, esp_parser, self.header, file_opener=gzip_open_text)

    print('%s duplicates found' % duplicates)
    print('%s zero-frequency mutations skipped' % skipped)

    return esp_mutations
def kinase_mappings(path='data/curated_kinase_IDs.txt'):
    """Create kinases from `kinase_name gene_name` mappings.

    For each kinase a `preferred isoforms` of given gene will be used.

    If given kinase already is in the database and has an isoform
    associated, the association will be superseded with the new one.
    The same applies to a kinase which occurs in the file more than once:
    the later mapping supersedes the earlier one instead of creating
    a second kinase with the same name.

    Args:
        path: path to the file with two columns: kinase name and gene name

    Returns:
        list of created kinases
    """
    known_kinases = create_key_model_dict(Kinase, 'name')

    new_kinases = []

    def parser(line):
        kinase_name, gene_name = line
        protein = get_preferred_gene_isoform(gene_name)

        if not protein:
            print(
                'No isoform for %s kinase mapped to %s gene!' %
                (kinase_name, gene_name)
            )
            return

        if kinase_name in known_kinases:
            kinase = known_kinases[kinase_name]
            if kinase.protein and kinase.protein != protein:
                print(
                    'Overriding kinase-protein association for '
                    '%s kinase. Old isoform: %s; new isoform: %s.'
                    % (kinase_name, kinase.protein.refseq, protein.refseq)
                )
            kinase.protein = protein
        else:
            kinase = Kinase(name=kinase_name, protein=protein)
            # remember the kinase, so a repeated mapping
            # reuses it rather than creating a duplicate
            known_kinases[kinase_name] = kinase
            new_kinases.append(kinase)

    parse_tsv_file(path, parser)

    return new_kinases
def drugbank(path='data/drugbank/drugbank.tsv'):
    """Load drugs targeting genes which are present in the database.

    Rows referring to genes which cannot be found (by name) are ignored.
    For the remaining rows the drug is fetched or created, linked to the
    target gene and annotated with its DrugBank id, groups and type
    ('NA' values are skipped).

    Args:
        path: path to the tab-separated file with five columns

    Returns:
        set of newly created drugs
    """
    drugs = set()

    # in case we need to query drugbank, it's better to keep names compatible
    drug_type_map = {
        'BiotechDrug': 'biotech',
        'SmallMoleculeDrug': 'small molecule'
    }

    def parser(data):
        drug_id, gene_name, drug_name, drug_groups, drug_type_name = data

        target_gene = Gene.query.filter_by(name=gene_name).first()

        if not target_gene:
            return

        drug, created = get_or_create(Drug, name=drug_name)
        if created:
            drugs.add(drug)

        drug.target_genes.append(target_gene)
        drug.drug_bank_id = drug_id

        for drug_group_name in drug_groups.split(';'):
            if drug_group_name != 'NA':
                drug_group, created = get_or_create(
                    DrugGroup, name=drug_group_name
                )
                drug.groups.add(drug_group)

        if drug_type_name != 'NA':
            drug_type, created = get_or_create(
                DrugType, name=drug_type_map[drug_type_name]
            )
            drug.type = drug_type

    # TODO: the header has type and group swapped
    # The columns are listed explicitly: splitting a string literal on '\t'
    # silently yields a single-element header as soon as the tab characters
    # in the literal get replaced with spaces.
    header = ['DRUG_id', 'GENE_symbol', 'DRUG_name', 'DRUG_type', 'DRUG_group']

    parse_tsv_file(path, parser, header)

    return drugs
def parse(self, path):
    """Collect MIMP predictions for mutations of known proteins.

    A row is skipped when:

    - the protein (first column) is not in `self.proteins`,
    - the reference residue of the mutation does not match the protein
      sequence (such cases are recorded in `self.broken_seq`),
    - there is not exactly one site of the protein at the given position
      (a warning is printed and emitted).

    Args:
        path: path to the file, read with `parse_tsv_file`
            using `self.header`

    Returns:
        list of tuples: `(mutation_id, int(line[3]), effect, line[9],
        line[10], float(line[12]), site_id)`, where `effect` is 1 for
        'gain' and 0 for 'loss'
    """
    mimps = []

    def parser(line):
        refseq = line[0]
        mut = line[1]
        psite_pos = line[2]

        try:
            protein = self.proteins[refseq]
        except KeyError:
            return

        ref, pos, alt = decode_raw_mutation(mut)

        # An explicit comparison is used rather than an `assert`, as
        # assertions are removed when Python runs with `-O` and mismatched
        # sequences would then go unnoticed.
        try:
            sequence_matches = ref == protein.sequence[pos - 1]
        except IndexError:
            sequence_matches = False

        if not sequence_matches:
            self.broken_seq[refseq].append((protein.id, alt))
            return

        assert line[13] in ('gain', 'loss')

        # MIMP mutations are always hardcoded PTM mutations
        mutation_id = self.get_or_make_mutation(pos, protein.id, alt, True)

        psite_pos = int(psite_pos)

        affected_sites = [
            site
            for site in protein.sites
            if site.position == psite_pos
        ]

        if len(affected_sites) != 1:
            warning = UserWarning(
                'Skipping %s: %s%s%s (for site at position %s): ' % (
                    refseq, ref, pos, alt, psite_pos
                ) +
                'MIMP site does not match to the database - ' +
                (
                    'too many (%s) sites found.' % len(affected_sites)
                    if affected_sites else
                    'given site not found.'
                )
            )
            print(warning)
            warn(warning)
            return

        site_id = affected_sites[0].id

        mimps.append(
            (
                mutation_id,
                int(line[3]),
                1 if line[13] == 'gain' else 0,
                line[9],
                line[10],
                float(line[12]),
                site_id
            )
        )

    parse_tsv_file(path, parser, self.header)

    return mimps
def proteins_and_genes(path='data/protein_data.tsv'):
    """Create proteins and genes based on data in a given file.

    Genes are matched by lowercased name; attributes of an already known
    gene which differ from those in the file are replaced (with a message).
    Proteins which already exist are skipped, as are the second and later
    occurrences of the same refseq id within the file.

    Args:
        path: path to the tab-separated file with transcript data

    Returns:
        view of created (new) proteins (values of a refseq -> protein dict)
    """
    # TODO where does the tsv file come from?
    print('Creating proteins and genes:')

    genes = create_key_model_dict(Gene, 'name', lowercase=True)
    known_proteins = get_proteins()

    proteins = {}

    # (column in the file, attribute of the protein)
    coordinates_to_save = [
        ('txStart', 'tx_start'),
        ('txEnd', 'tx_end'),
        ('cdsStart', 'cds_start'),
        ('cdsEnd', 'cds_end')
    ]

    # a list storing refseq ids which occur at least twice in the file
    with_duplicates = []
    potentially_empty_genes = set()

    header = [
        'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd',
        'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds',
        'score', 'name2', 'cdsStartStat', 'cdsEndStat', 'exonFrames'
    ]

    column_indices = tuple(
        header.index(column_name)
        for column_name, _ in coordinates_to_save
    )
    attribute_names = [attribute for _, attribute in coordinates_to_save]

    def parser(line):
        # use name2 (fourth column from the end)
        gene_name = line[-4]

        strand = line[3]
        assert strand in ('+', '-')

        gene_data = {
            'name': gene_name,
            'chrom': line[2][3:],    # remove chr prefix
            'strand': strand == '+'
        }

        gene_key = gene_name.lower()

        if gene_key in genes:
            gene = genes[gene_key]
            for key, value in gene_data.items():
                previous = getattr(gene, key)
                if previous == value:
                    continue
                print(
                    'Replacing %s %s with %s (previously: %s)'
                    % (gene, key, value, previous)
                )
                setattr(gene, key, value)
        else:
            gene = Gene(**gene_data)
            genes[gene_key] = gene

        # load protein
        refseq = line[1]

        # if protein is already in database no action is required
        if refseq in known_proteins:
            return

        # do not allow duplicates: only the first
        # occurrence of a refseq id is processed
        if refseq in proteins:
            with_duplicates.append(refseq)
            potentially_empty_genes.add(gene)
            return

        protein_data = {'refseq': refseq, 'gene': gene}

        coordinate_values = [
            int(value)
            for index, value in enumerate(line)
            if index in column_indices
        ]
        protein_data.update(zip(attribute_names, coordinate_values))

        proteins[refseq] = Protein(**protein_data)

    parse_tsv_file(path, parser, header)

    only_isoform_count = sum(
        len(gene.isoforms) == 1
        for gene in potentially_empty_genes
    )
    print('Duplicated that are only isoforms for gene:', only_isoform_count)
    print('Duplicated rows detected:', len(with_duplicates))

    return proteins.values()
def domains(path='data/biomart_protein_domains_20072016.txt'):
    """Load occurrences of InterPro domains in known proteins.

    Rows are skipped if the protein (looked up by the RefSeq column) is
    unknown, if there is no domain data (only seven columns) or if the
    chromosome does not match the chromosome of the protein's gene.

    An occurrence overlapping an existing occurrence of the same InterPro
    domain in more than 75% (relative to the shorter one) is merged into
    the existing one by extending its range; otherwise a new domain is
    created.

    Args:
        path: path to the tab-separated file with domains

    Returns:
        list of newly created domains

    Raises:
        AssertionError: if domain start is not lower than its end
            or if the accession does not start with 'IPR'
    """
    proteins = get_proteins()

    print('Loading domains:')

    interpro_domains = create_key_model_dict(InterproDomain, 'accession')
    new_domains = []

    skipped = 0
    wrong_length = 0
    not_matching_chrom = []

    header = [
        'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Protein ID',
        'Chromosome Name', 'Gene Start (bp)', 'Gene End (bp)',
        'RefSeq mRNA [e.g. NM_001195597]', 'Interpro ID',
        'Interpro Short Description', 'Interpro Description',
        'Interpro end', 'Interpro start'
    ]

    def parser(line):
        nonlocal skipped, wrong_length

        try:
            protein = proteins[line[6]]  # by refseq
        except KeyError:
            skipped += 1
            return

        # If there is no data about the domains, skip this record
        if len(line) == 7:
            return

        # Diagnostics use plain conditions instead of try/assert/except,
        # so that they are not lost when Python runs with `-O`.
        if len(line) != 12:
            print(line, len(line))

        # is start lower than end?
        # (note: the end column [10] precedes the start column [11])
        assert int(line[11]) < int(line[10])

        accession = line[7]

        # according to:
        # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC29841/#__sec2title
        assert accession.startswith('IPR')

        start, end = int(line[11]), int(line[10])

        # TODO: the assertion fails for some domains: what to do?
        # assert end <= protein.length
        if end > protein.length:
            wrong_length += 1

        if line[3] != protein.gene.chrom:
            skipped += 1
            not_matching_chrom.append(line)
            return

        if accession not in interpro_domains:
            interpro_domains[accession] = InterproDomain(
                accession=line[7],   # Interpro Accession
                short_description=line[8],   # Interpro Short Description
                description=line[9],   # Interpro Description
            )

        interpro = interpro_domains[accession]

        similar_domains = [
            # select similar domain occurrences with criteria being:
            domain for domain in protein.domains
            # - the same interpro id
            if domain.interpro == interpro and
            # - at least 75% of common coverage for shorter occurrence of domain
            (
                (min(domain.end, end) - max(domain.start, start))
                / min(len(domain), end - start)
                > 0.75
            )
        ]

        if similar_domains:
            # more than one similar occurrence is unexpected: report it,
            # but still merge into the first one
            if len(similar_domains) != 1:
                print(similar_domains)

            domain = similar_domains[0]

            domain.start = min(domain.start, start)
            domain.end = max(domain.end, end)
        else:
            domain = Domain(
                interpro=interpro,
                protein=protein,
                start=start,
                end=end
            )
            new_domains.append(domain)

    parse_tsv_file(path, parser, header)

    print(
        'Domains loaded,', skipped, 'proteins skipped.',
        'Domains exceeding proteins length:', wrong_length,
        'Domains skipped due to not matching chromosomes:',
        len(not_matching_chrom)
    )

    return new_domains
def external_references(path='data/HUMAN_9606_idmapping.dat.gz', refseq_lrg='data/LRG_RefSeqGene', refseq_link='data/refseq_link.tsv.gz'):
    """Import references of proteins to external databases.

    The files are processed in the following order:

    1. `refseq_lrg` - RefSeq peptide/gene ids and Entrez ids of genes,
    2. `refseq_link` - full names of proteins, Entrez ids and
       RefSeq peptide ids,
    3. `path` (first pass) - UniProt accessions, matched by 'RefSeq_NT' rows,
    4. `path` (second pass) - 'reviewed' status of UniProt entries and
       Ensembl peptide ids for the accessions collected in the first pass.

    Only RefSeq ids starting with 'NM' which correspond to exactly one
    protein in the database are taken into account.

    Args:
        path: gzipped mappings with three columns:
            UniProt accession, reference type, value
        refseq_lrg: file with the ten columns listed in `file_header` below
        refseq_link: gzipped file with the eight columns listed
            in `header` below

    Returns:
        list of references collected in the first pass over `path`; a
        reference is listed once for each 'RefSeq_NT' row that matched it
    """
    from models import Protein
    from models import ProteinReferences
    from models import EnsemblPeptide
    from sqlalchemy.orm.exc import NoResultFound

    # UniProt accession (without isoform suffix) -> list of references
    references = defaultdict(list)

    def add_uniprot_accession(data):
        """Link UniProt entries to references of proteins ('RefSeq_NT' rows)."""

        # full uniprot includes isoform (if relevant)
        full_uniprot, ref_type, value = data

        if ref_type == 'RefSeq_NT':
            # get protein
            # (the part of the id following the first dot is dropped)
            refseq_nm = value.split('.')[0]

            if not refseq_nm or not refseq_nm.startswith('NM') or not full_uniprot:
                return

            try:
                protein = Protein.query.filter_by(refseq=refseq_nm).one()
            except NoResultFound:
                return

            try:
                uniprot, isoform = full_uniprot.split('-')
                isoform = int(isoform)
            except ValueError:
                # only one isoform ?
                # print('No isoform specified for', full_uniprot, refseq_nm)
                uniprot = full_uniprot
                isoform = 1

            reference, new = get_or_create(ProteinReferences, protein=protein)
            uniprot_entry, _ = get_or_create(UniprotEntry, accession=uniprot, isoform=isoform)
            reference.uniprot_entries.append(uniprot_entry)
            references[uniprot].append(reference)

            if new:
                db.session.add(reference)

    # reference type in the file -> name of EnsemblPeptide attribute
    ensembl_references_to_collect = {'Ensembl_PRO': 'peptide_id'}

    def add_references_by_uniprot(data):
        """Add data available for UniProt accessions collected beforehand."""

        full_uniprot, ref_type, value = data

        if '-' in full_uniprot:
            uniprot, isoform = full_uniprot.split('-')
            uniprot_tied_references = references.get(uniprot, None)
            if not uniprot_tied_references:
                return

            relevant_references = []
            # select relevant references:
            # (those having an entry with the isoform number from this row)
            for reference in uniprot_tied_references:
                if any(entry.isoform == int(isoform) for entry in reference.uniprot_entries):
                    relevant_references.append(reference)

        else:
            uniprot_tied_references = references.get(full_uniprot, None)
            if not uniprot_tied_references:
                return
            relevant_references = uniprot_tied_references

        if ref_type == 'UniProtKB-ID':
            # http://www.uniprot.org/help/entry_name
            # "Each >reviewed< entry is assigned a unique entry name upon integration into UniProtKB/Swiss-Prot"
            # Entry names comes in format: X_Y;
            # - for Swiss-Prot entry X is a mnemonic protein identification code (at most 5 characters)
            # - for TrEMBL entry X is the same as accession code (6 to 10 characters)
            x, y = value.split('_')

            if len(x) <= 5:
                for reference in relevant_references:
                    assert '-' not in full_uniprot
                    entry = UniprotEntry.query.filter_by(accession=full_uniprot, reference=reference).one()
                    entry.reviewed = True

            # NOTE(review): nesting level of this `return` was reconstructed
            # from whitespace-collapsed source - confirm
            return

        if ref_type in ensembl_references_to_collect:

            attr = ensembl_references_to_collect[ref_type]

            for relevant_reference in relevant_references:
                attrs = {'reference': relevant_reference, attr: value}

                peptide, new = get_or_create(EnsemblPeptide, **attrs)

                if new:
                    db.session.add(peptide)

    def add_ncbi_mappings(data):
        """Set RefSeq peptide/gene ids of references and Entrez ids of genes."""

        # example row (two of the ten columns are empty here):
        # 9606 3329 HSPD1 NG_008915.1 NM_199440.1 NP_955472.1 reference standard
        taxonomy, entrez_id, gene_name, refseq_gene, lrg, refseq_nucleotide, t, refseq_peptide, p, category = data

        refseq_nm = refseq_nucleotide.split('.')[0]

        if not refseq_nm or not refseq_nm.startswith('NM'):
            return

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            return

        reference, new = get_or_create(ProteinReferences, protein=protein)

        if new:
            db.session.add(reference)

        reference.refseq_np = refseq_peptide.split('.')[0]
        reference.refseq_ng = refseq_gene.split('.')[0]
        gene = protein.gene

        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' % (gene.name, gene_name))

        entrez_id = int(entrez_id)

        # an Entrez id which is already set gets replaced only if the row
        # has a different id and a gene name matching the name of the gene
        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' % (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id)
                    )
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

    parse_tsv_file(refseq_lrg, add_ncbi_mappings, file_header=[
        '#tax_id', 'GeneID', 'Symbol', 'RSG', 'LRG', 'RNA', 't', 'Protein', 'p', 'Category'
    ])

    # add mappings retrieved from UCSC tables for completeness
    header = [
        '#name', 'product', 'mrnaAcc', 'protAcc', 'geneName', 'prodName', 'locusLinkId', 'omimId'
    ]
    for line in iterate_tsv_gz_file(refseq_link, header):
        gene_name, protein_full_name, refseq_nm, refseq_peptide, _, _, entrez_id, omim_id = line

        if not refseq_nm or not refseq_nm.startswith('NM'):
            continue

        try:
            protein = Protein.query.filter_by(refseq=refseq_nm).one()
        except NoResultFound:
            continue

        gene = protein.gene

        if gene.name != gene_name:
            print('Gene name mismatch for RefSeq mappings: %s vs %s' % (gene.name, gene_name))

        entrez_id = int(entrez_id)

        if protein_full_name:
            if protein.full_name:
                if protein.full_name != protein_full_name:
                    print(
                        'Protein full name mismatch: %s vs %s for %s'
                        % (protein.full_name, protein_full_name, protein.refseq)
                    )
                # NOTE(review): nesting level of this `continue` was
                # reconstructed from whitespace-collapsed source - confirm
                continue
            protein.full_name = protein_full_name

        # the same Entrez id update rules as in `add_ncbi_mappings`
        if gene.entrez_id:
            if gene.entrez_id != entrez_id:
                print('Entrez ID mismatch for isoforms of %s gene: %s, %s' % (gene.name, gene.entrez_id, entrez_id))
                if gene.name == gene_name:
                    print(
                        'Overwriting %s entrez id with %s for %s gene, because record with %s has matching gene name'
                        % (gene.entrez_id, entrez_id, gene.name, entrez_id)
                    )
                    gene.entrez_id = entrez_id
        else:
            gene.entrez_id = entrez_id

        if refseq_peptide:
            reference, new = get_or_create(ProteinReferences, protein=protein)

            if new:
                db.session.add(reference)

            # a mismatch is only reported; the value from this file
            # is assigned regardless
            if reference.refseq_np and reference.refseq_np != refseq_peptide:
                print(
                    'Refseq peptide mismatch between LRG and UCSC retrieved data: %s vs %s for %s'
                    % (reference.refseq_np, refseq_peptide, protein.refseq)
                )

            reference.refseq_np = refseq_peptide

    # two passes over the same file: the second one needs
    # the `references` mapping which is filled by the first one
    parse_tsv_file(path, add_uniprot_accession, file_opener=gzip.open, mode='rt')
    parse_tsv_file(path, add_references_by_uniprot, file_opener=gzip.open, mode='rt')

    return [reference for reference_group in references.values() for reference in reference_group]
def parse(self, path):
    """Collect ClinVar mutations together with their disease annotations.

    Rows without any disease name other than 'not_specified' or
    'not provided' are not imported.

    Args:
        path: path to the file, opened with `gzip_open_text`
            and read using `self.header`

    Returns:
        a tuple of three elements:

        - list of mutations, each being a list of: mutation id, list made
          of the 'RS' value(s), and the 'MUT', 'VLD' and 'PMC' values,
        - list of `(mutation pointer, significance, disease id, status)`
          tuples, where the pointer is the length of the list of mutations
          at the time of adding the tuple (i.e. 1-based position
          of the mutation),
        - names of diseases which were created during the import
    """
    clinvar_mutations = []
    clinvar_data = []
    duplicates = 0
    # name of newly created disease -> id assigned to the disease
    new_diseases = OrderedDict()

    # keys to be extracted from metadata; the order matters, as the
    # values are later accessed by position
    clinvar_keys = (
        'RS',
        'MUT',
        'VLD',
        'PMC',
        'CLNSIG',
        'CLNDBN',
        'CLNREVSTAT',
    )

    highest_disease_id = get_highest_id(Disease)

    def clinvar_parser(line):
        nonlocal highest_disease_id, duplicates

        metadata = line[20].split(';')

        clinvar_entry = make_metadata_ordered_dict(clinvar_keys, metadata)

        # sub-entries are separated with either '|' or ',';
        # a falsy (e.g. missing) value results in None instead of a list
        names, statuses, significances = (
            (entry.replace('|', ',').split(',') if entry else None)
            for entry in (
                clinvar_entry[key]
                for key in ('CLNDBN', 'CLNREVSTAT', 'CLNSIG')
            )
        )

        # those lengths should always be equal if they exist
        sub_entries_cnt = max(
            [
                len(x)
                for x in (names, statuses, significances)
                if x
            ] or [0]
        )

        at_least_one_significant_sub_entry = False

        for i in range(sub_entries_cnt):

            try:
                if names:
                    if names[i] not in ('not_specified', 'not provided'):
                        names[i] = self._beautify_disease_name(names[i])
                        at_least_one_significant_sub_entry = True
                if statuses and statuses[i] == 'no_criteria':
                    statuses[i] = None
            except IndexError:
                print('Malformed row (wrong count of subentries) on %s-th entry:' % i)
                print(line)
                return False

        values = list(clinvar_entry.values())

        # following 2 lines are result of issue #47 - we don't import those
        # clinvar mutations that do not have any diseases specified:
        if not at_least_one_significant_sub_entry:
            return

        for mutation_id in self.preparse_mutations(line):

            # take care of duplicates
            duplicated = self.look_after_duplicates(mutation_id, clinvar_mutations, values[:4])
            if duplicated:
                duplicates += 1
                continue

            # take care of nearly-duplicates
            same_mutation_pointers = self.mutations_details_pointers_grouped_by_unique_mutations[mutation_id]
            assert len(same_mutation_pointers) <= 1
            if same_mutation_pointers:
                pointer = same_mutation_pointers[0]

                old = self.data_as_dict(clinvar_mutations[pointer])
                new = self.data_as_dict(values, mutation_id=mutation_id)

                if old['db_snp_ids'] != [new['db_snp_ids']]:
                    clinvar_mutations[pointer][1].append(new['db_snp_ids'])

                # if either of the dbSNP entries is validated, the mutation is validated
                # (the same with presence in PubMed)
                for key in ['is_validated', 'is_in_pubmed_central']:
                    if old[key] != new[key] and new[key]:
                        index = self.insert_keys.index(key)
                        clinvar_mutations[pointer][index] = True

                print(
                    'Merged details referring to the same mutation (%s): %s into %s'
                    % (mutation_id, values, clinvar_mutations[pointer])
                )
                # the details were merged into the existing record,
                # so no new mutation (nor disease data) is added
                continue

            self.protect_from_duplicates(mutation_id, clinvar_mutations)

            # Python 3.5 makes it easy: **values (but is not available)
            clinvar_mutations.append(
                [
                    mutation_id,
                    [values[0]],
                    values[1],
                    values[2],
                    values[3],
                ]
            )

            for i in range(sub_entries_cnt):
                name = names[i]

                # we don't want _uninteresting_ data
                if name in ('not_specified', 'not provided'):
                    continue

                if name in new_diseases:
                    disease_id = new_diseases[name]
                else:
                    disease, created = get_or_create(Disease, name=name)

                    if created:
                        # the id of a new disease is not read from the
                        # object; the next value of the counter is used
                        highest_disease_id += 1
                        new_diseases[name] = highest_disease_id
                        disease_id = highest_disease_id
                    else:
                        disease_id = disease.id

                clinvar_data.append(
                    (
                        # the mutation was just appended, so the length
                        # is its 1-based position in the list
                        len(clinvar_mutations),
                        int(significances[i]) if significances is not None else None,
                        disease_id,
                        statuses[i] if statuses else None,
                    )
                )

    parse_tsv_file(
        path, clinvar_parser,
        self.header, file_opener=gzip_open_text
    )

    print('%s duplicates found' % duplicates)

    return clinvar_mutations, clinvar_data, new_diseases.keys()