def handle(self, *args, **options):
    """Merge Chemical rows whose names differ only in letter case.

    For every case-insensitive name that occurs more than once, duplicates
    are merged pairwise into the row with the lowest id (via self.merge),
    all inside a single transaction.
    """
    # Names occurring more than once, filtered on the DB side instead of
    # pulling every (name, cnt) pair into Python and looping over it.
    duplicated_names = list(
        Chemical.objects.annotate(name_lower=Lower("name"))
        .values("name_lower")
        .annotate(cnt=Count("name_lower"))
        .filter(cnt__gt=1)
        .values_list("name_lower", flat=True))

    chemicals = (Chemical.objects
                 .annotate(name_lower=Lower("name"))
                 .filter(name_lower__in=duplicated_names)
                 .order_by("name_lower"))

    merging_chemical = None
    remaining_cnt = chemicals.count()
    current_name = ""
    merged_pairs = 0
    with transaction.atomic():
        for chemical in chemicals:
            remaining_time(remaining_cnt)
            if current_name == chemical.name_lower:
                # Always merge the higher id into the lower one so the
                # oldest record survives.
                if merging_chemical.id < chemical.id:
                    self.merge(merging_chemical, chemical)
                else:
                    self.merge(chemical, merging_chemical)
                merged_pairs += 1
            else:
                # First occurrence of this lowercase name: it becomes the
                # merge target for subsequent equal names.
                current_name = chemical.name_lower
                merging_chemical = chemical
    print("Successfully merged %s pairs of chemicals" % (merged_pairs, ))
def handle(self, *args, **options):
    """Load disease-pathway interactions from a CSV export.

    Creates the distinct DiseasePathway rows first, then one
    DiseasePathwayInteraction per CSV row (foreign keys may be None
    when the referenced disease/pathway/gene is not found).
    """
    frame = pandas.read_csv(options.get('file'),
                            header=0,
                            delimiter=',',
                            quoting=csv.QUOTE_ALL)
    frame = frame.replace(np.nan, '', regex=True)

    # Unique (name, id) pairs become DiseasePathway rows.
    pathway_frame = frame[["PathwayName", "PathwayID"]].drop_duplicates()
    pathway_frame.columns = ("pathway_name", "pathway_id")
    pathway_records = pathway_frame.to_dict('records')

    rows = frame.to_dict('records')
    total = len(rows)
    with transaction.atomic():
        DiseasePathway.objects.bulk_create([
            DiseasePathway(pathway_name=rec["pathway_name"],
                           pathway_id=rec["pathway_id"])
            for rec in pathway_records
        ])

        # Lookup tables: external id -> primary key.
        pathway_ids = dict(
            DiseasePathway.objects.values_list("pathway_id", "id"))
        disease_ids = dict(
            DiseaseTrait.objects.filter(
                category=disease_category).values_list("ctd_id", "id"))
        gene_ids = dict(Gene.objects.values_list("symbol", "id"))

        pending = []
        for row in rows:
            remaining_time(total)
            pending.append(DiseasePathwayInteraction(
                disease_id=disease_ids.get(row["DiseaseID"]),
                pathway_id=pathway_ids.get(row["PathwayID"]),
                gene_id=gene_ids.get(row["InferenceGeneSymbol"].lower()),
                gene_symbol=row["InferenceGeneSymbol"]))
        DiseasePathwayInteraction.objects.bulk_create(pending)
def handle(self, *args, **options):
    """Import chemical-pathway enrichment rows from a CTD CSV file.

    Only rows whose ChemicalID (prefixed with "MESH:") matches an
    existing Chemical.chemical_number are imported.
    """
    # NOTE: the unused `go_type = options.get('go_type')` local was removed.
    data = pandas.read_csv(options.get('file'),
                           header=0,
                           delimiter=',',
                           quoting=csv.QUOTE_ALL)
    data = data.replace(np.nan, '', regex=True)
    with transaction.atomic():
        # chemical_number -> pk for every chemical with a MESH number.
        existing_chemicals = dict(
            Chemical.objects.filter(chemical_number__isnull=False)
            .values_list("chemical_number", "id"))
        data = data.to_dict('records')
        count = len(data)
        interactions_to_save = []
        for row in data:
            remaining_time(count)
            chemical = existing_chemicals.get("MESH:" + row["ChemicalID"])
            if chemical:
                interactions_to_save.append(ChemicalPathway(
                    chemical_id=chemical,
                    cas_rn=row.get("CasRN"),
                    pathway_name=row.get("PathwayName"),
                    pathway_id=row.get("PathwayID"),
                    p_value=row.get("PValue"),
                    corrected_p_value=row.get("CorrectedPValue"),
                    target_match_qty=row.get("TargetMatchQty"),
                    target_total_qty=row.get("TargetTotalQty"),
                    background_match_qty=row.get("BackgroundMatchQty"),
                    background_total_qty=row.get("BackgroundTotalQty")))
        ChemicalPathway.objects.bulk_create(interactions_to_save)
def handle(self, *args, **options):
    """Tag cryptically named chemicals as obscure and everything else
    as 'of biological interest'.

    A name with 5+ dash/space-separated tokens, or a digit followed
    somewhere by a dash, is treated as obscure.
    """
    obscure_candidates = list(set(Chemical.objects.filter(
        Q(name__iregex=r"(.+\-|.+\ ){4,}.+") |
        Q(name__iregex=r".*[\d].*[\-].*")).all()))
    obscure_category = SubstanceCategory.objects.filter(
        slug="obscure_chemicals").first()

    # Every category (with descendants) an obscure chemical must leave.
    # NOTE: "bilological" matches the slug stored in the DB — do not "fix".
    curated_categories = list(itertools.chain(*[
        record.get_family()
        for record in SubstanceCategory.objects.filter(slug__in=[
            "popular-drugs", "natural-treatments", "beneficial-substances",
            "important-natural-compounds", "gras",
            "chemical_of_bilological_interest"
        ]).all()
    ]))

    total = len(obscure_candidates)
    with transaction.atomic():
        for chemical in obscure_candidates:
            remaining_time(total)
            chemical.categories.add(obscure_category)
            chemical.categories.remove(*curated_categories)

    bio_interest = SubstanceCategory.objects.filter(
        slug="chemical_of_bilological_interest").first()
    remainder = Chemical.objects.exclude(
        categories__in=obscure_category.get_family())
    total = len(remainder)
    with transaction.atomic():
        for chemical in remainder:
            remaining_time(total)
            chemical.categories.add(bio_interest)
def handle(self, *args, **options):
    """Regenerate the slug of every HealthEffect from its name."""
    total = HealthEffect.objects.count()
    for health_effect in HealthEffect.objects.all():
        remaining_time(total)
        health_effect.slug = slugify(health_effect.name)
        health_effect.save()
def handle(self, *args, **options):
    """Rebuild every Organism slug as slugify("<id>-<latin name>")."""
    queryset = Organism.objects.all()
    total = queryset.count()
    with transaction.atomic():
        for organism in queryset:
            remaining_time(total)
            # Prefix with the pk so slugs stay unique across organisms
            # sharing a latin name.
            organism.slug = slugify(
                "-".join([str(organism.id), organism.latin_name]))
            organism.save()
def handle(self, *args, **options):
    """Link each ChemicalPathway row to its Pathway via pathway_id."""
    pathway_pk_by_id = dict(
        Pathway.objects.values_list("pathway_id", "id"))
    rows = ChemicalPathway.objects.all()
    total = rows.count()
    with transaction.atomic():
        for chemical_pathway in rows:
            remaining_time(total)
            pk = pathway_pk_by_id.get(chemical_pathway.pathway_id)
            if pk:
                chemical_pathway.related_pathway_id = pk
                chemical_pathway.save()
    print("Done")
def handle(self, *args, **options):
    """Backfill CTD identifiers on existing genes from a CTD gene CSV.

    Each Gene is matched by symbol first; if no CSV row matches, it
    falls back to a substring match of the gene name against the CSV
    synonyms column. The first matching row wins.
    """
    headers = {
        "GeneSymbol": str,
        "GeneName": str,
        "GeneID": str,
        "AltGeneIDs": str,
        "Synonyms": str,
        "BioGRIDIDs": str,
        "PharmGKBIDs": str,
        "UniprotIDs": str
    }
    data = pandas.read_csv(options.get('file'),
                           header=0,
                           delimiter=',',
                           quoting=csv.QUOTE_ALL,
                           dtype=headers)
    data.columns = [
        "gene_symbol", "gene_name", "gene_id", "alt_gene_id", "synonyms",
        "bio_grid_ids", "pharm_gkbid_ids", "uniprot_ids"
    ]
    data = data.replace(np.nan, '', regex=True)
    existing_genes = Gene.objects.all()
    count = existing_genes.count()
    # Normalise the CSV symbols once; gene.symbol is compared lowercase
    # (presumably symbols are stored lowercase in the DB — TODO confirm).
    data.gene_symbol = data.gene_symbol.str.lower().str.strip()
    with transaction.atomic():
        for gene in existing_genes:
            remaining_time(count)
            # .loc replaces DataFrame.ix, which was removed from pandas.
            row = data.loc[data.gene_symbol == gene.symbol]
            if row.index.size == 0:
                # Fall back to a synonym substring match.
                mask = data.synonyms.str.contains(gene.name)
                mask = mask[mask]  # keep only the True entries
                if mask.index.size == 0:
                    continue
                row = data.loc[mask.index]
            first = row.index[0]
            gene.ctd_id = row.gene_id[first]
            gene.ctd_alt_gene_ids = row.alt_gene_id[first]
            gene.ctd_bio_grid_ids = row.bio_grid_ids[first]
            gene.ctd_pharm_gkb_ids = row.pharm_gkbid_ids[first]
            gene.ctd_uniprot_ids = row.uniprot_ids[first]
            gene.save()
def handle(self, *args, **options):
    """Expand pipe-separated PubMed ids into newline-separated URLs
    stored on each ChemicalGeneInteraction's references field."""
    interactions_qs = ChemicalGeneInteraction.objects.all()
    total = interactions_qs.count()
    with transaction.atomic():
        for batch in chunks(interactions_qs, 10000):
            for interaction in batch:
                remaining_time(total)
                if not interaction.pub_med_ids:
                    continue
                interaction.references = "\n".join(
                    "https://www.ncbi.nlm.nih.gov/pubmed/" + pub_med_id
                    for pub_med_id in interaction.pub_med_ids.split("|"))
                interaction.save()
def create_chemical_organism_interactions(self, file_path):
    """Bulk-load ChemicalConcentration rows from a tab-separated dump.

    related_item_id points to either an organism or a preparation,
    depending on rel_type; empty/NULL numeric fields default to 0.
    """
    frame = pandas.read_csv(file_path,
                            header=0,
                            delimiter='\t',
                            quoting=csv.QUOTE_ALL)
    frame = frame.replace(np.nan, '', regex=True)
    frame = frame.replace('NULL', '', regex=True)
    rows = frame.to_dict('records')
    total = len(rows)

    pending = []
    for row in rows:
        remaining_time(total)
        organism_id = None
        preparation_id = None
        related = row["related_item_id"]
        if row["rel_type"] == "organism" and related:
            # Ids above 566 map 1:1; lower ids are shifted down by one —
            # presumably compensating an earlier pk renumbering. Confirm
            # against the organism table before reusing this rule.
            organism_id = related if related > 566 else related - 1
        if row["rel_type"] == "preparation" and related:
            preparation_id = related
        pending.append(ChemicalConcentration(
            rel_type=row["rel_type"],
            source_compound_id=int(row["source_compound_id"] or 0),
            source_food_id=int(row["source_food_id"] or 0),
            orig_food_id=row["orig_food_id"],
            orig_food_common_name=row["orig_food_common_name"],
            orig_food_scientific_name=row["orig_food_scientific_name"],
            orig_food_part=row["orig_food_part"],
            orig_compound_id=row["orig_compound_id"],
            orig_compound_name=row["orig_compound_name"],
            conc=float(row["conc"] or 0),
            conc_min=float(row["conc_min"] or 0),
            conc_max=float(row["conc_max"] or 0),
            conc_unit=row["conc_unit"],
            citation=row["citation"],
            citation_type=row["citation_type"],
            orig_method=row["orig_method"],
            orig_unit_expression=row["orig_unit_expression"],
            ref_compound=row["ref_compound"],
            ref_food=row["ref_food"],
            compound_id=int(row["compound_id"] or 0),
            related_item_id=int(row["related_item_id"] or 0),
            preparation_id=preparation_id,
            organism_id=organism_id))

    print("Start bulk creation")
    for batch in chunks(pending, 25000):
        remaining_time(30)
        ChemicalConcentration.objects.bulk_create(batch)
    print("Finished")
def handle(self, *args, **options):
    """Attach chemicals to concentrations by case-insensitive name."""
    concentrations = ChemicalConcentration.objects.only(
        "orig_compound_name")
    # lowercase chemical name -> pk, built once up front.
    chemical_pk_by_name = dict(
        Chemical.objects.annotate(name_lower=Lower('name'))
        .values_list('name_lower', 'id'))
    total = concentrations.count()
    for batch in chunks(concentrations, 2000):
        with transaction.atomic():
            for concentration in batch:
                remaining_time(total)
                pk = chemical_pk_by_name.get(
                    concentration.orig_compound_name.lower())
                if pk:
                    concentration.chemical.add(pk)
    print("Success!")
def handle(self, *args, **options):
    """Assign curated categories to every chemical named in a text file.

    Each line is matched case-insensitively against the chemical name
    and against its pipe-separated synonyms field.
    """
    categories = SubstanceCategory.objects.filter(slug__in=[
        "natural-treatments", "beneficial-substances",
        "important-natural-compounds", "chemical_of_bilological_interest"
    ]).all()
    with open(options.get('file')) as source:
        lines = source.read().splitlines()
    total = len(lines)
    with transaction.atomic():
        for raw_line in lines:
            remaining_time(total)
            name = raw_line.strip()
            if not name:
                continue
            # synonyms is pipe-delimited, so a match can be the whole
            # field, its first entry, its last entry, or an inner one.
            matches = Chemical.objects.filter(
                Q(name__iexact=name) |
                Q(synonyms__iexact=name) |
                Q(synonyms__istartswith=name + "|") |
                Q(synonyms__iendswith="|" + name) |
                Q(synonyms__icontains="|" + name + "|")).all()
            for chemical in matches:
                chemical.categories.add(*categories)
def handle(self, *args, **options):
    """Fill unified_concentration by scaling conc (or conc_max as a
    fallback) with the unit factor from the `multiplier` mapping."""
    concentrations = ChemicalConcentration.objects.only(
        "conc", "conc_unit", "conc_max")
    total = concentrations.count()
    for batch in chunks(concentrations, 2000):
        with transaction.atomic():
            for record in batch:
                remaining_time(total)
                if record.conc > 0:
                    record.unified_concentration = (
                        record.conc * multiplier.get(record.conc_unit))
                elif record.conc_max:
                    # No measured conc: fall back to the reported maximum.
                    record.unified_concentration = (
                        record.conc_max * multiplier.get(record.conc_unit))
                record.save()
    print("Success!")
def upload_chemical_disease_interactions(self, file):
    """Load CTD chemical-disease interaction CSV parts from a directory.

    `file` is a directory path (with trailing separator) whose entries
    are split CSV parts. A row is kept only when both its inference gene
    and its disease resolve to existing records; the chemical is
    optional (chemical_id may be None).
    """
    import os  # moved to the top of the function, before any logic

    existing_chemicals = dict(
        Chemical.objects.filter(chemical_number__isnull=False)
        .values_list("chemical_number", "id"))
    existing_diseases = dict(
        DiseaseTrait.objects.filter(ctd_id__isnull=False)
        .values_list("ctd_id", "id"))
    existing_genes = dict(
        Gene.objects.filter(symbol__isnull=False)
        .values_list("symbol", "id"))

    files = os.listdir(file)
    # Rough total (~1M rows per part) used only for the ETA display.
    count = len(files) * 1000000
    with transaction.atomic():
        for part in files:
            data = pandas.read_csv(file + part,
                                   header=None,
                                   comment="#",
                                   delimiter=',',
                                   quoting=csv.QUOTE_ALL)
            data.columns = [
                "ChemicalName", "ChemicalID", "CasRN", "DiseaseName",
                "DiseaseID", "DirectEvidence", "InferenceGeneSymbol",
                "InferenceScore", "OmimIDs", "PubMedIDs"
            ]
            data = data.replace(np.nan, '', regex=True)
            rows = data.to_dict('records')
            interactions_to_create = []
            for row in rows:
                remaining_time(count)
                chemical = existing_chemicals.get(
                    "MESH:" + row["ChemicalID"])
                disease = existing_diseases.get(row["DiseaseID"])
                gene = existing_genes.get(
                    row["InferenceGeneSymbol"].lower())
                # Guard clause replaces the redundant `else: continue`.
                if not (gene and disease):
                    continue
                interactions_to_create.append(
                    ChemicalDiseaseInteraction(
                        chemical_id=chemical,
                        disease_id=disease,
                        inference_gene_id=gene,
                        direct_evidence=row["DirectEvidence"],
                        inference_score=float(row["InferenceScore"])
                        if row["InferenceScore"] else 0,
                        omim_ids=row["OmimIDs"],
                        pub_med_ids=row["PubMedIDs"],
                        cas_rn=row["CasRN"]))
            ChemicalDiseaseInteraction.objects.bulk_create(
                interactions_to_create)
def handle(self, *args, **options):
    """Merge CTD disease metadata into existing DiseaseTrait records.

    Diseases are matched case-insensitively by name or by any CTD
    synonym; rows without a match create new disease records.
    """
    headers = {
        "DiseaseName": str,
        "DiseaseID": str,
        "AltDiseaseIDs": str,
        "Definition": str,
        "ParentIDs": str,
        "TreeNumbers": str,
        "ParentTreeNumbers": str,
        "Synonyms": str,
        "SlimMapping": str
    }
    frame = pandas.read_csv(options.get('file'),
                            header=0,
                            delimiter=',',
                            quoting=csv.QUOTE_ALL,
                            dtype=headers)
    frame.columns = [
        "ctd_name", "ctd_id", "ctd_alt_id", "ctd_definition",
        "ctd_parent_ids", "ctd_tree_numbers", "ctd_parent_tree_numbers",
        "ctd_synonyms", "ctd_slim_mapping"
    ]
    frame = frame.replace(np.nan, '', regex=True)
    total = len(frame)
    with transaction.atomic():
        for _, row in frame.iterrows():
            remaining_time(total)
            # Match the primary name OR any pipe-separated synonym.
            name_query = Q(name__iexact=row["ctd_name"])
            for synonym in row.ctd_synonyms.split("|"):
                name_query |= Q(name__iexact=synonym)
            matches = DiseaseTrait.objects.filter(
                category=disease_category).filter(name_query)
            if matches.exists():
                for disease in matches:
                    self.update_existing_disease(disease, row)
            else:
                self.create_new_disease(row)
def handle(self, *args, **options):
    """Import disease <-> Gene Ontology associations from a CTD CSV.

    Creates the distinct GeneOntology terms first (tagged with the
    command's go_type option), then an association for every row whose
    GO term and disease both resolve to existing records.
    """
    go_type = options.get('go_type')
    data = pandas.read_csv(options.get('file'),
                           header=0,
                           delimiter=',',
                           quoting=csv.QUOTE_ALL)
    data = data.replace(np.nan, '', regex=True)
    # DataFrame.sort() was removed in pandas 0.20 — use sort_values().
    go_data = (data[["GOName", "GOID"]]
               .sort_values("GOID")
               .drop_duplicates()
               .to_dict('records'))
    with transaction.atomic():
        GeneOntology.objects.bulk_create([
            GeneOntology(go_id=record.get("GOID"),
                         go_name=record.get("GOName"),
                         type=go_type) for record in go_data
        ])
        go_objects = dict(GeneOntology.objects.values_list("go_id", "id"))
        disease_objects = dict(
            DiseaseTrait.objects.filter(ctd_id__isnull=False)
            .values_list("ctd_id", "id"))
        rows = data.to_dict('records')
        count = len(rows)
        interactions_to_save = []
        for row in rows:
            remaining_time(count)
            go = go_objects.get(row["GOID"])
            disease = disease_objects.get("MESH:" + row["DiseaseID"])
            if go and disease:
                interactions_to_save.append(
                    DiseaseGOAssociations(
                        disease_id=disease,
                        gene_ontology_id=go,
                        inference_gene=row["InferenceGeneSymbols"],
                        inference_gene_qty=row["InferenceGeneQty"]))
        DiseaseGOAssociations.objects.bulk_create(interactions_to_save)