def _add_family_info(self, project_id, family_id, individuals):
    """
    Register a family and its backing variant collection.

    Only IDs are stored here. Once this has run, the family record exists
    with status 'loading' and its collection has been indexed.

    Raises:
        Exception: if the (project_id, family_id) pair is already registered.
    """
    if self.family_exists(project_id, family_id):
        raise Exception("Family (%s, %s) already exists" % (project_id, family_id))

    # Make sure every member is known before the family record refers to it.
    for member_id in individuals:
        if self.individual_exists(project_id, member_id):
            continue
        self.add_individual(project_id, member_id)

    project_slug = slugify(project_id, separator='_')
    family_slug = slugify(family_id, separator='_')
    family_coll_name = "family_%s_%s" % (project_slug, family_slug)

    family_record = dict(
        project_id=project_id,
        family_id=family_id,
        individuals=individuals,
        coll_name=family_coll_name,
        status='loading',
    )

    # The database handle is looked up on the settings module by name.
    db = getattr(settings, self._db_name)
    self._index_family_collection(db[family_coll_name])
    db.families.save(family_record)
def _add_family_info(self, project_id, family_id, individuals):
    """
    Register a family and its backing variant collection.

    Only IDs are stored here. Once this has run, the family record exists
    with status 'loading' and its collection has been indexed.

    A family that is already registered is left untouched: the call returns
    without raising.
    """
    if self.family_exists(project_id, family_id):
        return

    # Make sure every member is known before the family record refers to it.
    for member_id in individuals:
        if self.individual_exists(project_id, member_id):
            continue
        self.add_individual(project_id, member_id)

    project_slug = slugify(project_id, separator='_')
    family_slug = slugify(family_id, separator='_')
    family_coll_name = "family_%s_%s" % (project_slug, family_slug)

    family_record = dict(
        project_id=project_id,
        family_id=family_id,
        individuals=individuals,
        coll_name=family_coll_name,
        status='loading',
    )

    self._index_family_collection(self._db[family_coll_name])
    self._db.families.save(family_record)
def write_xl_rows_to_ped(ped_filename, xl_rows):
    """Dump spreadsheet rows into a tab-delimited ped file.

    Args:
        ped_filename: output filename
        xl_rows: a list of tuples where each tuple has 6 elements:
            family_id, sample_id, paternal_id, maternal_id, sex, affected

    Sample and parent IDs are slugified; missing parents and missing sex
    are written as '.', and a missing affected status as '-9'.
    """
    sex_codes = {'M': '1', 'F': '2'}
    affected_codes = {
        'u': '1',
        'unaffected': '1',
        'no': '1',
        'a': '2',
        'affected': '2',
        'yes': '2',
    }

    with open(ped_filename, 'w') as out:
        for i, row in enumerate(xl_rows):
            assert len(row) >= 6, (
                "Unexpected number of columns in row #%(i)s: %(row)s"
                % {'i': i, 'row': row})

            if not any(row):
                continue  # skip empty rows

            print("%s: %s" % (i, row))

            family_id, sample_id, paternal_id, maternal_id, sex, affected = row[0:6]

            sample_id = slugify(sample_id, replace_dot=True)
            paternal_id = slugify(paternal_id, replace_dot=True)
            maternal_id = slugify(maternal_id, replace_dot=True)

            assert family_id and sample_id, (
                "family_id or sample_id not specified in row: %(row)s"
                % {'row': row})

            # Unknown parents are encoded as '.'
            paternal_id = paternal_id or '.'
            maternal_id = maternal_id or '.'

            if not sex:
                sex = '.'
            elif sex not in ("1", "2"):
                # Only the first letter decides: M... -> 1, F... -> 2
                sex = sex_codes[sex[0].upper()]

            if affected is None:
                affected = '-9'
            elif affected not in ("1", "2"):
                affected = affected_codes[affected.strip().lower()]

            columns = [family_id, sample_id, paternal_id, maternal_id, sex, affected]
            out.write('\t'.join(columns) + '\n')
def handle(self, *args, **options):
    """Rename families whose IDs were stored as lower-cased slugs.

    Args:
        args[0]: project_id of the project to update.
        args[1]: path to a text file with one raw family ID per line.

    For each raw ID, the family stored under the lower-cased slug (if any)
    has its family_id replaced by the case-preserving slug.
    """
    project_id = args[0]
    project = Project.objects.get(project_id=project_id)

    # Read the IDs inside a context manager so the file handle is released.
    with open(args[1]) as id_file:
        raw_family_ids = [line.strip('\n') for line in id_file]

    for raw_id in raw_family_ids:
        new_slugified_id = slugify(raw_id, separator='_')
        old_slugified_id = new_slugified_id.lower()

        # One query instead of exists() followed by get().
        try:
            family = Family.objects.get(project=project, family_id=old_slugified_id)
        except Family.DoesNotExist:
            continue

        family.family_id = new_slugified_id  # set family ID to new slug repr
        family.save()
def set_genotypes_from_vcf_fields(vcf_fields, variant, alt_allele_pos, vcf_header_fields, genotype_meta=True, indivs_to_include=None, vcf_id_map=None):
    """
    Populate variant.genotypes from one VCF data row and return the variant.

    vcf_header_fields is just a list of the headers in the vcf
    (with the # stripped of the #CHROM in the first column)
    vcf_id_map: dict of [ID in the VCF file] -> [Individual ID]
    indivs_to_include: if given and non-empty, only these individuals
    (compared after slugifying) get a genotype.

    Raises Exception if the row and the header have a different number of
    columns, or if genotype_meta is false (not implemented).
    """
    num_columns = len(vcf_fields)
    if num_columns != len(vcf_header_fields):
        raise Exception("Wrong number of columns")

    format_str = vcf_fields[8]
    allele_position_map = get_allele_position_map(vcf_fields[3], vcf_fields[4])
    vcf_filter = vcf_fields[6].lower()

    # Record at which position of the FORMAT column each known key sits.
    known_format_keys = {'AD': 'ad', 'DP': 'dp', 'GQ': 'gq', 'PL': 'pl'}
    formats = {}
    for position, key in enumerate(format_str.split(':')):
        if key in known_format_keys:
            formats[known_format_keys[key]] = position

    if indivs_to_include:
        indivs_to_include = [
            slugify(indiv_id, separator='_', replace_dot=True)
            for indiv_id in indivs_to_include
        ]

    genotypes = {}
    # Sample columns start at index 9.
    for col_index in range(9, num_columns):
        vcf_id = slugify(vcf_header_fields[col_index], separator='_', replace_dot=True)
        indiv_id = vcf_id_map.get(vcf_id, vcf_id) if vcf_id_map else vcf_id

        if indivs_to_include and indiv_id not in indivs_to_include:
            continue

        geno_str = vcf_fields[col_index]
        try:
            if not genotype_meta:
                raise Exception("genotypes without meta not implemented - need to add kwarg")
            genotypes[indiv_id] = get_genotype_from_str(
                geno_str,
                formats,
                alt_allele_pos,
                allele_position_map,
                vcf_filter=vcf_filter,
            )
        except:
            # Report the offending input, then let the original error propagate.
            sys.stdout.write("Could not parse genotype from string: %s with format: %s. Allele_position_map: %s" % (geno_str, format_str, allele_position_map))
            raise

    variant.genotypes = genotypes
    return variant
def write_xl_rows_to_ped(ped_filename, xl_rows):
    """Writes the given rows to a ped file with the given filename

    Args:
        ped_filename: output filename
        xl_rows: a list of tuples where each tuple has 6 elements:
            family_id, sample_id, paternal_id, maternal_id, sex, affected

    Sample and parent IDs are slugified, and only the part of family_id
    before the first '-' is kept. Missing parents and missing sex are
    written as '.', and a missing affected status as '-9'.

    Raises:
        AssertionError: if a non-empty row has fewer than 6 columns, or
            family_id / sample_id is missing.
        KeyError: if sex is not "1"/"2" and does not start with M, F, U or ?.
        ValueError: if affected is not recognised.
    """
    with open(ped_filename, 'w') as out:
        for i, row in enumerate(xl_rows):
            # Skip empty rows first, so an empty short row is ignored rather
            # than tripping the column-count assertion below.
            if not any(row):
                continue

            assert len(row) >= 6, (
                "Unexpected number of columns in row #%(i)s: %(row)s"
                % {'i': i, 'row': row})

            print("%s: %s" % (i, row))

            family_id, sample_id, paternal_id, maternal_id, sex, affected = row[0:6]

            sample_id = slugify(sample_id, replace_dot=True)
            paternal_id = slugify(paternal_id, replace_dot=True)
            maternal_id = slugify(maternal_id, replace_dot=True)

            # Only split when there is a value, so a missing family_id reaches
            # the assertion below instead of failing with AttributeError.
            if family_id:
                family_id = family_id.split('-')[0]

            assert family_id, "family_id not specified in row: %(row)s" % {'row': row}
            assert sample_id, "sample_id not specified in row: %(row)s" % {'row': row}

            paternal_id = paternal_id or '.'
            maternal_id = maternal_id or '.'

            if sex:
                if sex not in ("1", "2"):
                    # Only the first character decides the code.
                    sex = {'M': '1', 'F': '2', 'U': '0', '?': '0'}[sex[0].upper()]
            else:
                sex = '.'

            if affected is not None:
                # Normalise once so every comparison below sees the same text
                # (previously "no"/"yes" were compared before stripping).
                affected = affected.strip().lower()
                if affected not in ("1", "2"):
                    if affected == "no" or affected.startswith("u"):
                        affected = '1'
                    elif affected == "yes" or affected.startswith("a"):
                        affected = '2'
                    else:
                        raise ValueError("Unexpected value for affected: " + affected)
            else:
                affected = '-9'

            out.write('\t'.join([family_id, sample_id, paternal_id, maternal_id, sex, affected]) + '\n')
def handle(self, *args, **options):
    """Rename families whose IDs were stored as lower-cased slugs.

    Args:
        args[0]: project_id of the project to update.
        args[1]: path to a text file with one raw family ID per line.

    For each raw ID, the family stored under the lower-cased slug (if any)
    has its family_id replaced by the case-preserving slug.
    """
    project_id = args[0]
    project = Project.objects.get(project_id=project_id)

    # Read the IDs inside a context manager so the file handle is released.
    with open(args[1]) as id_file:
        raw_family_ids = [line.strip('\n') for line in id_file]

    for raw_id in raw_family_ids:
        new_slugified_id = slugify(raw_id, separator='_')
        old_slugified_id = new_slugified_id.lower()

        # One query instead of exists() followed by get().
        try:
            family = Family.objects.get(project=project, family_id=old_slugified_id)
        except Family.DoesNotExist:
            continue

        family.family_id = new_slugified_id  # set family ID to new slug repr
        family.save()
def get_individuals_from_fam_file(fam_file, project_id='.'):
    """
    Returns a list of individuals from a FAM file

    Args:
        fam_file: iterable of tab-delimited lines with the columns
            family_id, indiv_id, paternal_id, maternal_id, sex, affected.
        project_id: project ID given to every Individual created.

    Blank lines and lines starting with '#' are ignored. A parent ID of
    "0" is stored as ".".

    Raises:
        ValueError: if a line cannot be parsed.
    """
    individuals = []
    for line in fam_file:
        # Pre-bind so the error message below can always be formatted.
        fields = None
        try:
            # ignore these rows (a blank line still carries its '\n', so
            # test the stripped text rather than comparing with '')
            if not line.strip() or line.startswith('#'):
                continue

            # Drop the line terminator, including a Windows '\r', so it
            # cannot leak into the last column.
            fields = line.rstrip('\r\n').split('\t')

            indiv_id = slugify(fields[1], separator='_', replace_dot=True)
            family_id = slugify(fields[0], separator='_', replace_dot=True)

            paternal_id = slugify(fields[2], separator='_', replace_dot=True)
            if paternal_id == "0":
                paternal_id = "."

            maternal_id = slugify(fields[3], separator='_', replace_dot=True)
            if maternal_id == "0":
                maternal_id = "."

            gender = 'unknown'
            if fields[4] == '2' or fields[4].upper().startswith('F'):
                gender = 'female'
            elif fields[4] == '1' or fields[4].upper().startswith('M'):
                gender = 'male'

            affected_status = 'unknown'
            if fields[5] == '2' or fields[5].upper().startswith('A'):
                affected_status = 'affected'
            elif fields[5] == '1' or fields[5].upper().startswith('U'):
                affected_status = 'unaffected'
        except Exception as e:
            raise ValueError(
                "Couldn't parse line: %(line)s. Fields: %(fields)s. exception: %(e)s"
                % {'line': line, 'fields': fields, 'e': e})

        indiv = Individual(
            indiv_id,
            project_id=project_id,
            family_id=family_id,
            paternal_id=paternal_id,
            maternal_id=maternal_id,
            gender=gender,
            affected_status=affected_status,
        )
        individuals.append(indiv)

    return individuals
def get_individuals_from_fam_file(fam_file, project_id='.'):
    """
    Returns a list of individuals from a FAM file

    Args:
        fam_file: iterable of tab-delimited lines with the columns
            family_id, indiv_id, paternal_id, maternal_id, sex, affected.
        project_id: project ID given to every Individual created.

    Blank lines and lines starting with '#' are ignored. A parent ID of
    "0" is stored as ".".

    Raises:
        ValueError: if a line cannot be parsed.
    """
    individuals = []
    for line in fam_file:
        try:
            # ignore these rows (a blank line still carries its '\n', so
            # test the stripped text rather than comparing with '')
            if not line.strip() or line.startswith('#'):
                continue

            # Drop the line terminator, including a Windows '\r', so it
            # cannot leak into the last column.
            fields = line.rstrip('\r\n').split('\t')

            indiv_id = slugify(fields[1], separator='_')
            family_id = slugify(fields[0], separator='_')

            paternal_id = slugify(fields[2], separator='_')
            if paternal_id == "0":
                paternal_id = "."

            maternal_id = slugify(fields[3], separator='_')
            if maternal_id == "0":
                maternal_id = "."

            gender = 'unknown'
            if fields[4] == '2' or fields[4] == 'F':
                gender = 'female'
            elif fields[4] == '1' or fields[4] == 'M':
                gender = 'male'

            affected_status = 'unknown'
            if fields[5] == '2':
                affected_status = 'affected'
            elif fields[5] == '1':
                affected_status = 'unaffected'
        except Exception as e:
            raise ValueError(
                "Couldn't parse line: %(line)s exception: %(e)s"
                % {'line': line, 'e': e})

        indiv = Individual(
            indiv_id,
            project_id=project_id,
            family_id=family_id,
            paternal_id=paternal_id,
            maternal_id=maternal_id,
            gender=gender,
            affected_status=affected_status,
        )
        individuals.append(indiv)

    return individuals
def get_ids_from_vcf(vcf_file):
    """
    Return the slugified sample IDs named in a VCF's #CHROM header line.

    Scanning stops at the first line starting with '#CHROM'. If no such
    line is found the function falls through and returns None.
    """
    for raw_line in vcf_file:
        header_line = raw_line.strip('\n')
        if not header_line.startswith('#CHROM'):
            continue

        # Sample columns start at index 9 of the header.
        sample_columns = get_vcf_headers(header_line)[9:]
        return [
            slugify(sample_name, separator='_', replace_dot=True)
            for sample_name in sample_columns
        ]
def set_genotypes_from_vcf_fields(vcf_fields, variant, alt_allele_pos, vcf_header_fields, genotype_meta=True, indivs_to_include=None, vcf_id_map=None):
    """
    if variant is a basic variants, initialize its genotypes from vcf_fields
    vcf_header_fields is just a list of the headers in the vcf
    (with the # stripped of the #CHROM in the first column)
    vcf_id_map: dict of [ID in the VCF file] -> [Individual ID]
    indivs_to_include: if given and non-empty, only these individuals
    (compared after slugifying) get a genotype.

    Returns the variant, with variant.genotypes set to a dict keyed by
    individual ID.

    Raises Exception if the row and the header have a different number of
    columns, or if genotype_meta is false (not implemented).
    """
    num_columns = len(vcf_fields)
    if num_columns != len(vcf_header_fields):
        raise Exception("Wrong number of columns")

    genotypes = {}

    format_str = vcf_fields[8]
    allele_position_map = get_allele_position_map(vcf_fields[3], vcf_fields[4])
    vcf_filter = vcf_fields[6].lower()

    # Record at which position of the FORMAT column each known key sits.
    formats = {}
    for i, item in enumerate(format_str.split(':')):
        if item == 'AD':
            formats['ad'] = i
        elif item == 'DP':
            formats['dp'] = i
        elif item == 'GQ':
            formats['gq'] = i
        elif item == 'PL':
            formats['pl'] = i

    if indivs_to_include:
        # Slugify with the same separator as the header IDs below so the
        # membership test compares like with like. Build a real list:
        # on Python 3 map() is a one-shot iterator that is always truthy
        # and is exhausted by the first `in` test.
        indivs_to_include = [
            slugify(indiv_id, separator='_') for indiv_id in indivs_to_include
        ]

    # Sample columns start at index 9.
    for col_index in range(9, num_columns):
        vcf_id = slugify(vcf_header_fields[col_index], separator='_')
        if vcf_id_map:
            indiv_id = vcf_id_map.get(vcf_id, vcf_id)
        else:
            indiv_id = vcf_id

        if indivs_to_include and indiv_id not in indivs_to_include:
            continue

        geno_str = vcf_fields[col_index]
        try:
            if genotype_meta:
                genotypes[indiv_id] = get_genotype_from_str(
                    geno_str,
                    formats,
                    alt_allele_pos,
                    allele_position_map,
                    vcf_filter=vcf_filter,
                )
            else:
                raise Exception("genotypes without meta not implemented - need to add kwarg")
        except:
            # Report the offending input, then let the original error propagate.
            sys.stdout.write("Could not parse genotype from string: %s with format: %s. Allele_position_map: %s" % (geno_str, format_str, allele_position_map))
            raise

    variant.genotypes = genotypes
    return variant
def add_vcf_file_to_project(project, vcf_file):
    """
    Attach vcf_file to every project individual whose ID appears in it.

    VCF sample IDs are matched against individual IDs after slugifying.
    When the slug differs from the raw VCF sample ID, the raw ID is stored
    on the individual as vcf_id.
    """
    # slugified sample ID -> sample ID exactly as written in the VCF
    slug_to_vcf_id = {}
    for sample_id in set(vcf_file.sample_id_list()):
        slug_to_vcf_id[slugify(sample_id, separator='_')] = sample_id

    for individual in project.individual_set.all():
        if individual.indiv_id not in slug_to_vcf_id:
            continue

        individual.vcf_files.add(vcf_file)

        raw_vcf_id = slug_to_vcf_id[individual.indiv_id]
        if individual.indiv_id != raw_vcf_id:
            individual.vcf_id = raw_vcf_id
            individual.save()
def add_vcf_file_to_project(project, vcf_file):
    """
    Attach vcf_file to every project individual whose ID appears in it.

    VCF sample IDs are matched against individual IDs after slugifying.
    When the slug differs from the raw VCF sample ID, the raw ID is stored
    on the individual as vcf_id.
    """
    # slugified sample ID -> sample ID exactly as written in the VCF
    slug_to_vcf_id = {}
    for sample_id in set(vcf_file.sample_id_list()):
        slug_to_vcf_id[slugify(sample_id, separator='_')] = sample_id

    for individual in project.individual_set.all():
        if individual.indiv_id not in slug_to_vcf_id:
            continue

        individual.vcf_files.add(vcf_file)

        raw_vcf_id = slug_to_vcf_id[individual.indiv_id]
        if individual.indiv_id != raw_vcf_id:
            individual.vcf_id = raw_vcf_id
            individual.save()
def get_ids_from_vcf(vcf_file):
    """
    Return the slugified sample IDs named in a VCF's #CHROM header line.

    Scanning stops at the first line starting with '#CHROM'. If no such
    line is found the function falls through and returns None.
    """
    for raw_line in vcf_file:
        header_line = raw_line.strip('\n')
        if not header_line.startswith('#CHROM'):
            continue

        # Sample columns start at index 9 of the header.
        sample_columns = get_vcf_headers(header_line)[9:]
        return [
            slugify(sample_name, separator='_', replace_dot=True)
            for sample_name in sample_columns
        ]
def add_breakpoint_from_dict(project, bp):
    """
    Add a breakpoint to the given project based on keys from the given dict.

    The sample id is presumed to already be loaded as an existing individual
    in the project.

    A breakpoint that already exists (same project, position and sample) is
    not updated or changed, even if the data being loaded is different. To
    reload it, delete it first. Loading new samples incrementally by simply
    running the load again is safe.
    """
    # Fields in dict are chr start end sample depth cscore partner genes cdsdist
    xpos = genomeloc.get_xpos(bp['chr'], int(bp['start']))
    sample_id = slugify(bp['sample'], separator='_')

    try:
        bp_record = Breakpoint.objects.get(
            project=project,
            xpos=xpos,
            individual__indiv_id=sample_id,
        )
    except Breakpoint.DoesNotExist:
        already_loaded = False
        bp_record = Breakpoint()
        bp_record.xpos = xpos
        bp_record.project = project
        bp_record.obs = int(bp['depth'])
        bp_record.individual = Individual.objects.get(project=project, indiv_id=sample_id)
        bp_record.sample_count = int(bp['sample_count'])
        bp_record.partner = bp['partner']
        bp_record.consensus = bp['cscore']
        bp_record.save()
    else:
        already_loaded = True

    # 'genes' and 'cdsdist' are parallel comma-separated lists.
    gene_symbols = bp['genes'].split(',')
    cds_distances = bp['cdsdist'].split(',')
    for gene_symbol, cds_dist in zip(gene_symbols, cds_distances):
        if not gene_symbol:
            continue

        if already_loaded:
            # Reuse the gene row of an existing breakpoint when there is one.
            try:
                gene = BreakpointGene.objects.get(breakpoint=bp_record, gene_symbol=gene_symbol)
            except BreakpointGene.DoesNotExist:
                gene = BreakpointGene()
        else:
            gene = BreakpointGene()

        gene.breakpoint = bp_record
        gene.gene_symbol = gene_symbol
        gene.cds_dist = int(cds_dist)
        gene.save()
def add_breakpoint_from_dict(project, bp):
    """
    Add a breakpoint to the given project based on keys from the given dict.

    The sample id is presumed to already be loaded as an existing individual
    in the project.

    A breakpoint that already exists (same project, position and sample) is
    not updated or changed, even if the data being loaded is different. To
    reload it, delete it first. Loading new samples incrementally by simply
    running the load again is safe.
    """
    # Fields in dict are chr start end sample depth cscore partner genes cdsdist
    xpos = genomeloc.get_xpos(bp['chr'], int(bp['start']))
    sample_id = slugify(bp['sample'], separator='_')

    try:
        bp_record = Breakpoint.objects.get(
            project=project,
            xpos=xpos,
            individual__indiv_id=sample_id,
        )
    except Breakpoint.DoesNotExist:
        already_loaded = False
        bp_record = Breakpoint()
        bp_record.xpos = xpos
        bp_record.project = project
        bp_record.obs = int(bp['depth'])
        bp_record.individual = Individual.objects.get(project=project, indiv_id=sample_id)
        bp_record.sample_count = int(bp['sample_count'])
        bp_record.partner = bp['partner']
        bp_record.consensus = bp['cscore']
        bp_record.save()
    else:
        already_loaded = True

    # 'genes' and 'cdsdist' are parallel comma-separated lists.
    gene_symbols = bp['genes'].split(',')
    cds_distances = bp['cdsdist'].split(',')
    for gene_symbol, cds_dist in zip(gene_symbols, cds_distances):
        if not gene_symbol:
            continue

        if already_loaded:
            # Reuse the gene row of an existing breakpoint when there is one.
            try:
                gene = BreakpointGene.objects.get(breakpoint=bp_record, gene_symbol=gene_symbol)
            except BreakpointGene.DoesNotExist:
                gene = BreakpointGene()
        else:
            gene = BreakpointGene()

        gene.breakpoint = bp_record
        gene.gene_symbol = gene_symbol
        gene.cds_dist = int(cds_dist)
        gene.save()
def handle(self, *args, **options):
    """Load a project's samples, PED files and VCF files from a directory.

    Args:
        args[0]: project_id of an existing project.
        args[1]: path to the project directory, which must contain a
            'project.yaml' file.

    Keys read from project.yaml: 'sample_id_list' and 'project_name'
    (required); 'nicknames', 'ped_files' and 'vcf_files' (optional).
    Relative paths in the spec are resolved against the project directory.
    """
    project_id = args[0]
    project = Project.objects.get(project_id=project_id)

    project_dir = os.path.abspath(args[1])
    project_yaml_file = os.path.join(project_dir, 'project.yaml')

    # NOTE(review): yaml.load without an explicit safe Loader can construct
    # arbitrary objects; confirm project.yaml is always trusted or switch to
    # yaml.safe_load.
    with open(project_yaml_file) as yaml_file:
        project_spec = yaml.load(yaml_file)

    # load in sample IDs that we'll use for the project
    sample_id_file = os.path.join(project_dir, project_spec['sample_id_list'])
    with open(sample_id_file) as id_file:
        sample_ids = [l.strip('\n') for l in id_file]
    sample_ids = [slugify(s, separator='_') for s in sample_ids]
    sample_management.add_indiv_ids_to_project(project, sample_ids)

    # set meta info
    project.project_name = project_spec['project_name']
    project.save()

    # nicknames
    if 'nicknames' in project_spec:
        # todo
        pass

    # load individuals
    if 'ped_files' in project_spec:
        for relative_path in project_spec['ped_files']:
            ped_file_path = os.path.join(project_dir, relative_path)
            # Close each PED file once it has been consumed.
            with open(ped_file_path) as ped_file:
                sample_management.update_project_from_fam(project, ped_file)
            # todo: add awesome-slugify to above

    # set VCF files
    if 'vcf_files' in project_spec:
        for relative_path in project_spec['vcf_files']:
            vcf_file_path = os.path.join(project_dir, relative_path)
            # todo: this should be a fn somewhere that add_vcf_to_project uses too
            vcf_file = VCFFile.objects.get_or_create(file_path=vcf_file_path)[0]
            sample_management.add_vcf_file_to_project(project, vcf_file)
def handle(self, *args, **options):
    """Load a project's samples, PED files and VCF files from a directory.

    Args:
        args[0]: project_id of an existing project.
        args[1]: path to the project directory, which must contain a
            'project.yaml' file.

    Keys read from project.yaml: 'sample_id_list' and 'project_name'
    (required); 'nicknames', 'ped_files' and 'vcf_files' (optional).
    Relative paths in the spec are resolved against the project directory.
    """
    project_id = args[0]
    project = Project.objects.get(project_id=project_id)

    project_dir = os.path.abspath(args[1])
    project_yaml_file = os.path.join(project_dir, 'project.yaml')

    # NOTE(review): yaml.load without an explicit safe Loader can construct
    # arbitrary objects; confirm project.yaml is always trusted or switch to
    # yaml.safe_load.
    with open(project_yaml_file) as yaml_file:
        project_spec = yaml.load(yaml_file)

    # load in sample IDs that we'll use for the project
    sample_id_file = os.path.join(project_dir, project_spec['sample_id_list'])
    with open(sample_id_file) as id_file:
        sample_ids = [l.strip('\n') for l in id_file]
    sample_ids = [slugify(s, separator='_') for s in sample_ids]
    sample_management.add_indiv_ids_to_project(project, sample_ids)

    # set meta info
    project.project_name = project_spec['project_name']
    project.save()

    # nicknames
    if 'nicknames' in project_spec:
        # todo
        pass

    # load individuals
    if 'ped_files' in project_spec:
        for relative_path in project_spec['ped_files']:
            ped_file_path = os.path.join(project_dir, relative_path)
            # Close each PED file once it has been consumed.
            with open(ped_file_path) as ped_file:
                sample_management.update_project_from_fam(project, ped_file)
            # todo: add awesome-slugify to above

    # set VCF files
    if 'vcf_files' in project_spec:
        for relative_path in project_spec['vcf_files']:
            vcf_file_path = os.path.join(project_dir, relative_path)
            # todo: this should be a fn somewhere that add_vcf_to_project uses too
            vcf_file = VCFFile.objects.get_or_create(file_path=vcf_file_path)[0]
            sample_management.add_vcf_file_to_project(project, vcf_file)
def load_project(self, project_id, json_path): #from collections import defaultdict #objects_by_pk = defaultdict(dict) print("------------------") project = Project.objects.get(project_id=project_id) users = {} families = {} cohorts = {} individuals = {} project_tags = {} project_phenotypes = {} gene_lists = {} with open(json_path) as f: contents = f.read() raw_json_data = json.loads(contents) #obj_generator = serializers.json.Deserializer(contents) # Couldn't find a way to make Deserializer return foreign key ids for obj in raw_json_data: #print("Object: " + str(obj)) obj_pk = obj['pk'] obj_model = obj['model'] obj_fields = obj['fields'] if obj_model == 'base.project': project = Project.objects.get(project_id=project_id) project.project_name = obj_fields['project_name'] project.description = obj_fields['description'] project.last_accessed_date = obj_fields['last_accessed_date'] if obj_fields['private_reference_populations']: #raise ValueError("private_reference_populations not implemented: " + str(obj_fields['private_reference_populations'])) pass if 'gene_lists' in obj_fields and obj_fields['gene_lists']: raise ValueError("gene_lists not implemented: " + str(project.gene_lists.all())) print("project: " + str(project)) project.save() elif obj_model == 'auth.user': try: user_queryset = User.objects.filter(email = obj_fields['email']) assert len(user_queryset) == 1 users[obj_pk] = user_queryset[0] except Exception, e: if obj_fields['username'] == 'monkol': users[obj_pk] = User.objects.get(email = '*****@*****.**') continue # users specific to this project #if not any(n in obj_fields['username'] for n in ["username1", "username2", ...]): # continue print("ERROR couldn't find user %s: %s %s" % (obj_pk, obj_fields, str(e))) if not obj_fields['email']: continue i = raw_input("Create this user? 
[y/n] ") if i.lower() != "y": continue print("Creating user: %s" % str(obj_fields)) matching_users = User.objects.filter( Q(email = obj_fields['email']) | Q(username=obj_fields['username']) ) if matching_users: assert len(matching_users) == 1 user = next(matching_users) else: user = User.objects.create(email = obj_fields['email'], username=obj_fields['username']) user.is_active = bool(obj_fields['is_active']) #user.is_superuser = bool(obj_fields['is_superuser']) #user.is_staff = bool(obj_fields['is_staff']) user.last_login = obj_fields['last_login'] user.groups = obj_fields['groups'] user.password = obj_fields['password'] user.date_joined = obj_fields['date_joined'] user.save() users[obj_pk] = user elif obj_model == 'base.projectcollaborator': collaborator, created = ProjectCollaborator.objects.get_or_create( project=project, user=users[obj_fields["user"]]) collaborator.collaborator_type = obj_fields['collaborator_type'] collaborator.save() elif obj_model == 'base.family': try: family = Family.objects.get(project=project, family_id=slugify(obj_fields['family_id'], separator='_')) except Exception, e: print("ERROR: family not found in local db: " + slugify(obj_fields['family_id'], separator='_')) continue
family_group, created = FamilyGroup.objects.get_or_create(project=project, slug=obj_fields['slug'], name=obj_fields['name'], description=obj_fields['description']) if not family_group.families.all(): for family_id in obj_fields['families']: family_group.families.add(families[family_id]) print("familygroup: " + str(family_group)) family_group.save() elif obj_model == 'base.familyimageslide': raise ValueError("FamilyImageSlide not implemented") elif obj_model == 'base.cohort': cohorts[obj_pk] = obj print("WARNING: Cohort not implemented. Won't deserialize: " + str(obj)) elif obj_model == "base.individual": obj_fields['indiv_id'] = slugify(obj_fields['indiv_id'], separator='_') try: individual = individuals[obj_pk] = Individual.objects.get(project=project, indiv_id=obj_fields['indiv_id']) except: print("ERROR: individual not found in local db: " + obj_fields['indiv_id']) continue print("individual: " + slugify(obj_fields['indiv_id'], separator='_')) individual.nickname = obj_fields['nickname'] individual.other_notes = obj_fields['other_notes'] individual.save() elif obj_model == "base.causalvariant": causal_variant, created = CausalVariant.objects.get_or_create( family = families[obj_fields["family"]], variant_type=obj_fields["variant_type"], xpos=obj_fields["xpos"],
import argparse
import os

from xbrowse.utils import slugify

if __name__ == '__main__':
    # Convert a tab-delimited PED file to the xbrowse dialect: '0' in the
    # parent, sex and affected columns becomes '.', and the family, sample
    # and parent IDs are slugified. The result is written next to the input
    # as '<input>.xbrowse.ped'.
    parser = argparse.ArgumentParser(description='Convert any PED file to the xbrowse dialect')
    parser.add_argument('ped')
    args = parser.parse_args()

    filename = args.ped
    if not os.path.exists(filename):
        raise Exception('File does not exist')

    if '.' not in filename:
        raise Exception('Filename must have an extension.')

    out_filename = filename + '.xbrowse.ped'

    # Context managers make sure both files are closed (and the output is
    # flushed) even if a line fails to parse.
    with open(filename) as infile, open(out_filename, 'w') as outfile:
        for line in infile:
            fields = line.strip('\n').split('\t')
            # parent IDs, sex and affected status: '0' means unknown
            for i in [2, 3, 4, 5]:
                if fields[i] == '0':
                    fields[i] = '.'
            # family, sample and parent IDs are slugified unless unknown
            for i in [0, 1, 2, 3]:
                if fields[i] != '.':
                    fields[i] = slugify(fields[i], separator='_')
            outfile.write('\t'.join(fields) + '\n')
name=obj_fields['name'], description=obj_fields['description']) if not family_group.families.all(): for family_id in obj_fields['families']: family_group.families.add(families[family_id]) print("familygroup: " + str(family_group)) family_group.save() elif obj_model == 'base.familyimageslide': raise ValueError("FamilyImageSlide not implemented") elif obj_model == 'base.cohort': cohorts[obj_pk] = obj print( "WARNING: Cohort not implemented. Won't deserialize: " + str(obj)) elif obj_model == "base.individual": obj_fields['indiv_id'] = slugify(obj_fields['indiv_id'], separator='_') try: individual = individuals[ obj_pk] = Individual.objects.get( project=project, indiv_id=obj_fields['indiv_id']) except: print("ERROR: individual not found in local db: " + obj_fields['indiv_id']) continue print("individual: " + slugify(obj_fields['indiv_id'], separator='_')) individual.nickname = obj_fields['nickname'] individual.other_notes = obj_fields['other_notes'] individual.save()
import argparse
import os

from xbrowse.utils import slugify

if __name__ == '__main__':
    # Convert a tab-delimited PED file to the xbrowse dialect: '0' in the
    # parent, sex and affected columns becomes '.', and the family, sample
    # and parent IDs are slugified. The result is written next to the input
    # as '<input>.xbrowse.ped'.
    parser = argparse.ArgumentParser(
        description='Convert any PED file to the xbrowse dialect')
    parser.add_argument('ped')
    args = parser.parse_args()

    filename = args.ped
    if not os.path.exists(filename):
        raise Exception('File does not exist')

    if '.' not in filename:
        raise Exception('Filename must have an extension.')

    out_filename = filename + '.xbrowse.ped'

    # Context managers make sure both files are closed (and the output is
    # flushed) even if a line fails to parse.
    with open(filename) as infile, open(out_filename, 'w') as outfile:
        for line in infile:
            fields = line.strip('\n').split('\t')
            # parent IDs, sex and affected status: '0' means unknown
            for i in [2, 3, 4, 5]:
                if fields[i] == '0':
                    fields[i] = '.'
            # family, sample and parent IDs are slugified unless unknown
            for i in [0, 1, 2, 3]:
                if fields[i] != '.':
                    fields[i] = slugify(fields[i], separator='_')
            outfile.write('\t'.join(fields) + '\n')
description=obj_fields['description']) if not family_group.families.all(): for family_id in obj_fields['families']: if family_id in families: family_group.families.add(families[family_id]) else: print("WARNING: family not found: " + family_id) print("familygroup: " + str(family_group)) family_group.save() elif obj_model == 'base.familyimageslide': raise ValueError("FamilyImageSlide not implemented") elif obj_model == 'base.cohort': cohorts[obj_pk] = obj print("WARNING: Cohort not implemented. Won't deserialize: " + str(obj)) elif obj_model == "base.individual": obj_fields['indiv_id'] = slugify(obj_fields['indiv_id'], separator='_') try: individual = individuals[obj_pk] = Individual.objects.get(project=project, indiv_id=obj_fields['indiv_id']) except: print("ERROR: individual not found in local db: " + obj_fields['indiv_id']) continue print("individual: " + slugify(obj_fields['indiv_id'], separator='_')) individual.nickname = obj_fields['nickname'] individual.other_notes = obj_fields['other_notes'] individual.save() elif obj_model == "base.causalvariant": causal_variant, created = CausalVariant.objects.get_or_create( family = families[obj_fields["family"]], variant_type=obj_fields["variant_type"], xpos=obj_fields["xpos"],
def handle(self, *args, **options):
    """Load a project's samples, PED, VCF and breakpoint files from a directory.

    Args:
        args[0]: project_id of an existing project.
        args[1]: path to the project directory, which must contain a
            'project.yaml' file.

    Keys read from project.yaml: 'sample_id_list' and 'project_name'
    (required); 'nicknames', 'ped_files', 'vcf_files' and
    'breakpoint_files' (optional). Relative paths in the spec are resolved
    against the project directory.

    Prints a message and exits with status 1 if fewer than two arguments
    are given or the project does not exist.
    """
    if len(args) < 2:
        print("Usage: ./manage.py load_project_dir <project_id> <project_path>")
        print("")
        sys.exit(1)

    project_id = args[0]
    try:
        project = Project.objects.get(project_id=project_id)
    except Project.DoesNotExist:
        print("\nError:")
        print("\nNo project could be found with id '%s'" % project_id)
        print("")
        print("Please use the add_project command first to add this project before loading it.")
        print("")
        sys.exit(1)

    project_dir = os.path.abspath(args[1])
    project_yaml_file = os.path.join(project_dir, 'project.yaml')

    # NOTE(review): yaml.load without an explicit safe Loader can construct
    # arbitrary objects; confirm project.yaml is always trusted or switch to
    # yaml.safe_load.
    with open(project_yaml_file) as yaml_file:
        project_spec = yaml.load(yaml_file)

    # load in sample IDs that we'll use for the project
    sample_id_file = os.path.join(project_dir, project_spec['sample_id_list'])
    with open(sample_id_file) as id_file:
        sample_ids = [l.strip('\n') for l in id_file]
    sample_ids = [slugify(s, separator='_') for s in sample_ids]
    sample_management.add_indiv_ids_to_project(project, sample_ids)

    # set meta info
    project.project_name = project_spec['project_name']
    project.save()

    # nicknames
    if 'nicknames' in project_spec:
        # todo
        pass

    # load individuals
    if 'ped_files' in project_spec:
        for relative_path in project_spec['ped_files']:
            ped_file_path = os.path.join(project_dir, relative_path)
            # Close each PED file once it has been consumed.
            with open(ped_file_path) as ped_file:
                sample_management.update_project_from_fam(project, ped_file)
            # todo: add awesome-slugify to above

    # set VCF files
    if 'vcf_files' in project_spec:
        for relative_path in project_spec['vcf_files']:
            vcf_file_path = os.path.join(project_dir, relative_path)
            # todo: this should be a fn somewhere that add_vcf_to_project uses too
            vcf_file = VCFFile.objects.get_or_create(file_path=vcf_file_path)[0]
            sample_management.add_vcf_file_to_project(project, vcf_file)

    # register breakpoint files (only the path is recorded here)
    if 'breakpoint_files' in project_spec:
        for relative_path in project_spec['breakpoint_files']:
            breakpoint_file = BreakpointFile()
            breakpoint_file.project = project
            breakpoint_file.file_path = os.path.join(project_dir, relative_path)
            breakpoint_file.save()
            print("Adding breakpoint file: %s" % breakpoint_file.file_path)
def load_project(self, project_id, json_path): #from collections import defaultdict #objects_by_pk = defaultdict(dict) print("------------------") project = Project.objects.get(project_id=project_id) users = {} families = {} cohorts = {} individuals = {} project_tags = {} project_phenotypes = {} gene_lists = {} with open(json_path) as f: contents = f.read() raw_json_data = json.loads(contents) #obj_generator = serializers.json.Deserializer(contents) # Couldn't find a way to make Deserializer return foreign key ids for obj in raw_json_data: #print("Object: " + str(obj)) obj_pk = obj['pk'] obj_model = obj['model'] obj_fields = obj['fields'] if obj_model == 'base.project': project = Project.objects.get(project_id=project_id) project.project_name = obj_fields['project_name'] project.description = obj_fields['description'] project.last_accessed_date = obj_fields['last_accessed_date'] if obj_fields['private_reference_populations']: #raise ValueError("private_reference_populations not implemented: " + str(obj_fields['private_reference_populations'])) pass if 'gene_lists' in obj_fields and obj_fields['gene_lists']: raise ValueError("gene_lists not implemented: " + str(project.gene_lists.all())) print("project: " + str(project)) project.save() elif obj_model == 'auth.user': try: user_queryset = User.objects.filter(email = obj_fields['email']) assert len(user_queryset) == 1 users[obj_pk] = user_queryset[0] except Exception, e: if obj_fields['username'] == 'monkol': users[obj_pk] = User.objects.get(email = '*****@*****.**') continue # users specific to this project #if not any(n in obj_fields['username'] for n in ["username1", "username2", ...]): # continue print("ERROR couldn't find user %s: %s %s" % (obj_pk, obj_fields, str(e))) if not obj_fields['email']: continue i = raw_input("Create this user? 
[y/n] ") if i.lower() != "y": continue print("Creating user: %s" % str(obj_fields)) matching_users = User.objects.filter( Q(email = obj_fields['email']) | Q(username=obj_fields['username']) ) if matching_users: assert len(matching_users) == 1 user = next(matching_users) else: user = User.objects.create(email = obj_fields['email'], username=obj_fields['username']) user.is_active = bool(obj_fields['is_active']) #user.is_superuser = bool(obj_fields['is_superuser']) #user.is_staff = bool(obj_fields['is_staff']) user.last_login = obj_fields['last_login'] user.groups = obj_fields['groups'] user.password = obj_fields['password'] user.date_joined = obj_fields['date_joined'] user.save() users[obj_pk] = user elif obj_model == 'base.projectcollaborator': collaborator, created = ProjectCollaborator.objects.get_or_create( project=project, user=users[obj_fields["user"]]) collaborator.collaborator_type = obj_fields['collaborator_type'] collaborator.save() elif obj_model == 'base.family': try: family = Family.objects.get(project=project, family_id=slugify(obj_fields['family_id'], separator='_')) except Exception, e: print("ERROR: family not found in local db: " + slugify(obj_fields['family_id'], separator='_')) continue