def add_gene_level_features(session, ref, reference_features):
    """Add the gene-level features of every gene on a reference to that reference.

    For each gene found under ``reference_features[ref.name]`` a 'gene' and an
    'mRNA' feature are added for the 'gene' entry, and one sub-feature call is
    made for each recognised bed feature type. Unrecognised feature types add
    nothing but still consume a feature id.

    :param session: not used by this function
    :param ref: reference sequence object; ``ref.name`` selects the genes
    :param reference_features: mapping ref name -> gene -> bed feature type -> feature dict
    """
    # bed feature type -> (IMGT feature name, name prefix)
    # TODO - make bed files use IMGT names
    # TODO - split bed exon_2 into L_PART1, V-REGION
    subfeature_args = {
        'nonamer': ('V-NONAMER', 'IGHVNona'),
        'spacer': ('V-SPACER', 'IGHVSpacer'),
        'heptamer': ('V-HEPTAMER', 'IGHVHepta'),
        'exon_2': ('exon_2', 'IGHVRegion'),
        'gencode_intron': ('V-INTRON', 'IGHVIntron'),
        'exon_1': ('L_PART-1', 'IGHVLP1'),
    }

    next_id = 1
    for gene_features in reference_features[ref.name].values():
        # The id in force when a gene starts is the parent of all its features
        gene_id = next_id

        for kind, rec in gene_features.items():
            if kind == 'gene':
                add_feature_to_ref(
                    rec['gene'], 'gene', 'V-GENE', rec['ref_seq'], 'gene',
                    rec['start'], rec['end'], '+',
                    f"Name={rec['gene']};ID={next_id}", next_id, ref)
                next_id += 1
                add_feature_to_ref(
                    rec['gene'], 'gene', 'V-GENE', rec['ref_seq'], 'mRNA',
                    rec['start'], rec['end'], '+',
                    f"Name={rec['gene']};ID={next_id}", gene_id, ref)
            elif kind in subfeature_args:
                imgt_name, prefix = subfeature_args[kind]
                add_gene_level_subfeature(rec, imgt_name, prefix, next_id, gene_id, ref)

            # NOTE(review): assumed to advance once per feature of every gene - confirm
            next_id += 1
def add_feature(feature, bed_name, reference_features, row, seq, session, subject):
    """Record one non-coding feature of an annotated allele for a subject.

    Does nothing when ``row[feature]`` is empty. Otherwise the feature's
    sequence is looked up (and saved under a hash-derived name if unknown),
    linked to the subject, and linked to a feature record on the subject's
    reference sequence, which is created from the bed coordinates if missing.

    :param feature: IMGT feature name; also the column of ``row`` holding the sequence
    :param bed_name: key of the feature in ``reference_features`` (bed naming)
    :param reference_features: mapping ref name -> gene -> bed feature type -> feature dict
    :param row: annotation record; must also contain 'haplotype'
    :param seq: the allele sequence record; ``seq.gene.name`` identifies the gene
    :param session: database session
    :param subject: subject record; ``subject.ref_seq`` is the reference
    """
    nt_seq = row[feature]
    if not nt_seq:
        return

    gene_name = seq.gene.name
    feature_seq = find_sequence_by_sequence(session, feature, gene_name, nt_seq)

    if not feature_seq:
        # Name unseen sequences by gene plus the last 4 hex digits of their SHA-256
        digest_tail = sha256(nt_seq.encode('utf-8')).hexdigest()[-4:]
        new_name = f"{gene_name}*{digest_tail}"
        feature_seq = save_genomic_sequence(session, new_name, gene_name, feature,
                                            True, False, '', nt_seq, '')

    haplotype = int(row['haplotype'].replace('h=', ''))
    update_subject_sequence_link(session, haplotype, subject, feature_seq)

    feature_rec = find_feature_by_name(session, feature, feature_seq.name, subject.ref_seq)

    if not feature_rec:
        feature_id = session.query(Feature).count()
        bed_rec = reference_features[subject.ref_seq.name][gene_name][bed_name]
        start = bed_rec['start']
        # L-PART2 is only the first 11 positions of the bed record it is taken from
        end = bed_rec['end'] if feature != 'L-PART2' else bed_rec['start'] + 11
        feature_rec = add_feature_to_ref(
            feature_seq.name, 'allele', feature, feature_seq.sequence, 'UTR',
            start, end, '+',
            f"Name={feature_seq.name};ID={feature_id}", feature_id, subject.ref_seq)

    link_sequence_to_feature(feature_seq, feature_rec)
def add_gene_level_subfeature(feature, imgt_feature_name, name_prefix, feature_id, parent_id, ref):
    """Add a gene-level sub-feature (as a 'CDS' record) to a reference.

    The feature name is ``<prefix><gene without 'IGHV'>*<last 4 hex digits of
    the SHA-256 of the reference sequence>``.

    When ``imgt_feature_name`` is 'exon_2' the bed record is split in two:
    the first 11 positions become 'L_PART-2' and the remainder 'V-REGION'
    (``name_prefix`` is ignored in that case). Both halves are written with
    the same ``feature_id`` and the full ``feature['ref_seq']``.

    :param feature: feature dict with 'gene', 'ref_seq', 'start' and 'end'
    :param imgt_feature_name: IMGT name of the feature, or 'exon_2' for the split case
    :param name_prefix: prefix of the generated feature name
    :param feature_id: id written into the GFF-style ``ID=`` attribute
    :param parent_id: id of the parent feature
    :param ref: reference sequence object the feature is added to
    """
    l_part2_length = 11

    gene = feature['gene']
    ref_seq = feature['ref_seq']
    name_suffix = f"{gene.replace('IGHV', '')}*{sha256(ref_seq.encode('utf-8')).hexdigest()[-4:]}"

    if imgt_feature_name == 'exon_2':
        split_point = feature['start'] + l_part2_length

        add_feature_to_ref(
            f"IGHVRegion{name_suffix}", 'gene', 'V-REGION', ref_seq, 'CDS',
            split_point, feature['end'], '+',
            f"Name={gene}_V-REGION;ID={feature_id}", parent_id, ref)

        # The Name attribute previously read '_L_PART1', mislabelling L-PART2;
        # it now follows the '<gene>_<imgt feature name>' pattern used elsewhere.
        add_feature_to_ref(
            f"IGHVLpart2{name_suffix}", 'gene', 'L_PART-2', ref_seq, 'CDS',
            feature['start'], split_point, '+',
            f"Name={gene}_L_PART-2;ID={feature_id}", parent_id, ref)
    else:
        add_feature_to_ref(
            f"{name_prefix}{name_suffix}", 'gene', imgt_feature_name, ref_seq, 'CDS',
            feature['start'], feature['end'], '+',
            f"Name={gene}_{imgt_feature_name};ID={feature_id}", parent_id, ref)
def process_igenotyper_record(session, species, dataset_dir, subject, annotation_file, reference_features):
    """Import the IGenotyper annotation rows belonging to one subject.

    The annotation file is copied to ``<dataset_dir>/samples`` if no file of
    that name exists, and is read once per file into the module-level
    ``annotation_records`` cache. For each row of the subject whose
    'genotyper_gene' has 'V' as its fourth character, the allele is found or
    saved as novel, linked to the subject, attached to a V-REGION feature on
    the subject's reference (created if missing), and its flanking features
    are added via ``add_feature``. Rows for other chains are ignored.

    :param session: database session
    :param species: not used by this function
    :param dataset_dir: directory receiving the copy of the annotation file
    :param subject: subject record (``identifier``, ``name_in_study``, ``ref_seq``)
    :param annotation_file: path of the annotation CSV
    :param reference_features: mapping ref name -> gene -> bed feature type -> feature dict
    """
    global annotation_records

    print(f"Importing subject {subject.identifier}")

    samples_path = os.path.join(dataset_dir, 'samples')
    if not os.path.isfile(samples_path):
        shutil.copy(annotation_file, samples_path)

    if annotation_file not in annotation_records:
        annotation_records[annotation_file] = read_csv(annotation_file)

    rows = [
        x for x in annotation_records[annotation_file]
        if x['sample_name'] == subject.name_in_study
    ]

    sense = '+'     # by + sense we mean 5' to 3'

    # TODO - use IMGT names in bed files and get rid of one of these arguments
    flanking_features = (
        ('V-NONAMER', 'nonamer'),
        ('V-SPACER', 'spacer'),
        ('V-HEPTAMER', 'heptamer'),
        ('L-PART2', 'exon_2'),
        ('V-INTRON', 'gencode_intron'),
        ('L-PART1', 'exon_1'),
    )

    for row in rows:
        # Fourth character of the gene name is the chain type; only V is handled
        if row['genotyper_gene'][3] != 'V':
            continue

        seq = find_allele_by_name(session, row['vdjbase_allele'])

        if not seq:
            seq = save_novel_allele(session, row['genotyper_gene'], row['vdjbase_allele'],
                                    row['notes'].replace('\\n', '\r\n'),
                                    row['V-REGION'], row['V-REGION-GAPPED'])

        update_subject_sequence_link(session, int(row['haplotype'].replace('h=', '')), subject, seq)

        feature = find_feature_by_name(session, 'V-REGION', seq.name, subject.ref_seq)

        if feature and seq.sequence != feature.feature_seq:
            print(
                f'Error: feature {feature.name} sequence does not match that of sequence {seq.name} in subject {subject.identifier}'
            )

        if not feature:
            feature_id = session.query(Feature).count()
            exon_2 = reference_features[subject.ref_seq.name][seq.gene.name]['exon_2']
            # V-REGION starts after the 11 positions of exon_2 assigned to L-PART2
            start = exon_2['start'] + 11
            end = exon_2['end']
            feature = add_feature_to_ref(
                seq.name, 'allele', 'V-REGION', seq.sequence, 'CDS', start, end, sense,
                f"Name={seq.name}_V-REGION;ID={feature_id}", feature_id, subject.ref_seq)

        link_sequence_to_feature(seq, feature)

        for imgt_name, bed_name in flanking_features:
            add_feature(imgt_name, bed_name, reference_features, row, seq, session, subject)
def _parse_imgt_range(s):
    """Parse a feature-table location string into ``(gene_range, strand)``.

    ``gene_range`` is a two-element sequence of start/end strings with any
    '<' / '>' markers removed; ``strand`` is '-' when the location is wrapped
    in ``complement(...)`` and '+' otherwise. If the string has no '..', the
    coordinates are not integers, start > end, or the span exceeds 10,000,000,
    a message is printed and ``(('1', '1'), '+')`` is returned.
    """
    invalid = (('1', '1'), '+')

    gene_range = s.split('..')
    if len(gene_range) < 2:
        print('Invalid gene range found: %s' % s)
        return invalid

    for i in (0, 1):
        gene_range[i] = gene_range[i].replace('>', '').replace('<', '')

    strand = '+'
    if 'complement(' in gene_range[0]:
        gene_range[0] = gene_range[0].replace('complement(', '')
        gene_range[1] = gene_range[1].replace(')', '')
        strand = '-'

    # Only a failed int() conversion is expected here, so catch just that
    try:
        start, end = int(gene_range[0]), int(gene_range[1])
    except ValueError:
        print('Invalid gene range found: %s' % s)
        return invalid

    if end - start > 10000000 or start > end:
        print('Invalid gene range found: %s' % s)
        return invalid

    return (gene_range, strand)


def process_imgt_assembly(session, sample_data):
    """Import a reference assembly and its gene features from an IMGT-style HTML page.

    The page at ``sample_data['URL']`` is fetched; the assembly sequence is
    read from the ``div.sequence/pre`` element and the features from the
    first table under ``div.features``. Dataset, reference sequence, study
    and sample records are saved, then the feature table is walked with a
    small state machine: a V-GENE/D-GENE/J-GENE row opens a gene, the
    following 'IMGT_allele' row names it (adding 'gene' and 'mRNA' features),
    subsequent recognised sub-feature rows are added with their sequences,
    and a "3'UTR" row closes the gene.

    :param session: not used by this function, which works through ``db.session``
    :param sample_data: dict of sample metadata; 'Assembly_id' and
        'Assembly_reference' are checked up front, other keys are read directly
    :return: None if a required key is missing, otherwise ``'\\n'.join(results)``
    """
    # NOTE(review): add_feature_to_ref / save_genomic_sequence are called here
    # with different argument lists from other callers in this file - confirm
    # this function is still in step with their current signatures.
    needed_items = ['Assembly_id', 'Assembly_reference']
    valid_entry = True
    for needed_item in needed_items:
        if needed_item not in sample_data:
            print('%s not specified' % (needed_item))
            valid_entry = False

    if not valid_entry:
        return

    print("Importing %s / %s" % (sample_data['Species'], sample_data['Sample']))

    page = requests.get(sample_data['URL'])
    tree = html.fromstring(page.content)
    seq_text = tree.xpath('//div[@class="sequence"]/pre')[0]

    # Only lines longer than 75 characters contribute; columns 1-69 of each
    # are kept with spaces removed.
    sequence = ''.join(
        line[1:70].replace(' ', '')
        for line in seq_text.text.split('\n')
        if len(line) > 75
    )

    sp, data_set = save_genomic_dataset_details(sample_data['Locus'], sample_data['Dataset'],
                                                sample_data['Species'])
    ref_seq = save_genomic_ref_seq(sample_data['Locus'], sample_data['Assembly_id'], sp, sequence,
                                   sample_data['Assembly_reference'])
    db.session.commit()
    study = save_genomic_study(sample_data['Sample'], sample_data['Institute'],
                               sample_data['Researcher'], sample_data['Reference'],
                               sample_data['Contact'])
    sample = save_genomic_sample(sample_data['Sample'], sample_data['Type'], sample_data['Date'],
                                 study, sp.id, ref_seq.id, data_set.id, sample_data['URL'])

    features = tree.xpath('//div[@class="features"]/table')[0]

    # Sub-feature rows accepted while inside each kind of gene
    # NOTE(review): J-GENE accepts 'D-REGION', not 'J-REGION' - looks like a
    # copy-paste slip; left unchanged because the keys of feature_type are not
    # visible here.
    accepted_subfeatures = {
        'V-GENE': ["5'UTR", 'L-PART1', 'V-INTRON', 'L-PART2', 'V-REGION', "3'UTR"],
        'D-GENE': ["5'UTR", 'D-REGION', "3'UTR"],
        'J-GENE': ["5'UTR", 'D-REGION', "3'UTR"],
    }

    state = None        # kind of gene currently open, or None
    name = None         # gene name once the IMGT_allele row has been seen
    full_name = None    # allele name (gene*allele) of the open gene
    gene_range = None
    strand = None
    parent_id = 0

    for row in features:
        values = [col.text for col in row]

        if len(values) < 3:
            continue

        if not state and values[0] in ['V-GENE', 'D-GENE', 'J-GENE']:
            gene_range, strand = _parse_imgt_range(values[2])
            state = values[0]
        elif state and not name:
            if values[1] == 'IMGT_allele':
                parent_id += 1
                name = values[2].split('*')[0]
                full_name = values[2]
                add_feature_to_ref(name, 'gene', gene_range[0], gene_range[1], strand,
                                   'Name=%s;ID=%s' % (name, parent_id), parent_id, ref_seq)
                parent_id += 1
                add_feature_to_ref(name, 'mRNA', gene_range[0], gene_range[1], strand,
                                   'Name=%s;ID=%s' % (name, parent_id), parent_id - 1, ref_seq)
        elif state and name:
            if values[0] in accepted_subfeatures.get(state, ()):
                gene_range, strand = _parse_imgt_range(values[2])

                if 'REGION' in values[0]:
                    seq_name = full_name
                    imgt_name = full_name
                else:
                    seq_name = name + '_' + values[0]
                    imgt_name = ''

                f = add_feature_to_ref(
                    seq_name, feature_type[values[0]], gene_range[0], gene_range[1], strand,
                    'Name=%s;Parent=%s' % (name + '_' + values[0], parent_id),
                    parent_id - 1, ref_seq)
                s = save_genomic_sequence(
                    seq_name, imgt_name, values[0], False, False, 'U',
                    ref_seq.sequence[int(gene_range[0]) - 1:int(gene_range[1])], '', sp)
                s.features.append(f)
                SampleSequence(sample=sample, sequence=s, chromosome='h1,h2', chromo_count=2)

        # The 3'UTR row ends the current gene
        if state and name and values[0] == "3'UTR":
            state = None
            name = None

    db.session.commit()
    # NOTE(review): 'results' is not assigned anywhere in this function -
    # presumably a module-level name; confirm it exists.
    return '\n'.join(results)