Exemplo n.º 1
0
def add_gene_level_features(session, ref, reference_features):
    """Add the gene-level features of every gene listed for ``ref``.

    Iterates over ``reference_features[ref.name]``. For each gene, a
    ``'gene'`` entry results in two ``add_feature_to_ref`` calls (one passing
    ``'gene'`` and one passing ``'mRNA'``); the other recognised entry types
    are handed to ``add_gene_level_subfeature``. Entry types that are not
    recognised are skipped.

    ``session`` is not used by this function.
    """
    # bed-file entry type -> (IMGT feature name, sequence-name prefix)
    # TODO - make bed files use IMGT names
    # TODO - split bed exon_2 into L_PART1, V-REGION
    subfeature_labels = {
        'nonamer': ('V-NONAMER', 'IGHVNona'),
        'spacer': ('V-SPACER', 'IGHVSpacer'),
        'heptamer': ('V-HEPTAMER', 'IGHVHepta'),
        'exon_2': ('exon_2', 'IGHVRegion'),
        'gencode_intron': ('V-INTRON', 'IGHVIntron'),
        'exon_1': ('L_PART-1', 'IGHVLP1'),
    }

    next_id = 1
    for gene_features in reference_features[ref.name].values():
        # Id in force when this gene's entries start; used as the parent id.
        gene_id = next_id

        for kind, entry in gene_features.items():
            if kind == 'gene':
                gene_name = entry['gene']
                add_feature_to_ref(
                    gene_name, 'gene', 'V-GENE', entry['ref_seq'], 'gene',
                    entry['start'], entry['end'], '+',
                    f"Name={gene_name};ID={next_id}", next_id, ref)

                # The mRNA record takes the following id, parented by gene_id.
                next_id += 1
                add_feature_to_ref(
                    gene_name, 'gene', 'V-GENE', entry['ref_seq'], 'mRNA',
                    entry['start'], entry['end'], '+',
                    f"Name={gene_name};ID={next_id}", gene_id, ref)
                continue

            labels = subfeature_labels.get(kind)
            if labels is not None:
                imgt_name, prefix = labels
                add_gene_level_subfeature(entry, imgt_name, prefix, next_id,
                                          gene_id, ref)

        next_id += 1
Exemplo n.º 2
0
def add_feature(feature, bed_name, reference_features, row, seq, session,
                subject):
    """Record one feature sequence from an annotation row for ``subject``.

    Returns immediately when ``row[feature]`` is empty. Otherwise the
    sequence is looked up (and saved under a hash-suffixed name when not
    found), linked to the subject for the row's haplotype, and linked to a
    feature record on ``subject.ref_seq``, which is created when missing.

    Args:
        feature: column name in ``row`` holding the sequence; also passed on
            as the feature label.
        bed_name: key of this feature in ``reference_features``.
        reference_features: nested mapping indexed by reference name, gene
            name and ``bed_name``, giving ``'start'`` and ``'end'``.
        row: annotation record; ``row['haplotype']`` looks like ``h=<int>``.
        seq: allele record whose ``gene.name`` identifies the gene.
        session: database session.
        subject: subject record providing ``ref_seq``.
    """
    feature_sequence = row[feature]
    if not feature_sequence:
        return

    gene_name = seq.gene.name
    sequence_rec = find_sequence_by_sequence(session, feature, gene_name,
                                             feature_sequence)

    if not sequence_rec:
        # Unknown sequence: name it after the gene plus the last four hex
        # digits of its SHA-256.
        digest = sha256(feature_sequence.encode('utf-8')).hexdigest()
        sequence_rec = save_genomic_sequence(
            session, f"{gene_name}*{digest[-4:]}", gene_name, feature, True,
            False, '', feature_sequence, '')

    haplotype = int(row['haplotype'].replace('h=', ''))
    update_subject_sequence_link(session, haplotype, subject, sequence_rec)

    ref_seq = subject.ref_seq
    feature_rec = find_feature_by_name(session, feature, sequence_rec.name,
                                       ref_seq)

    if not feature_rec:
        feature_id = session.query(Feature).count()
        bounds = reference_features[ref_seq.name][gene_name][bed_name]
        start = bounds['start']
        # L-PART2 only spans the first 11 positions of its bed interval.
        end = start + 11 if feature == 'L-PART2' else bounds['end']

        feature_rec = add_feature_to_ref(
            sequence_rec.name, 'allele', feature, sequence_rec.sequence,
            'UTR', start, end, '+',
            f"Name={sequence_rec.name};ID={feature_id}", feature_id, ref_seq)

    link_sequence_to_feature(sequence_rec, feature_rec)
Exemplo n.º 3
0
def add_gene_level_subfeature(feature, imgt_feature_name, name_prefix,
                              feature_id, parent_id, ref):
    """Add the reference record(s) for one sub-feature of a gene.

    Normally a single record is added through ``add_feature_to_ref``, named
    ``<name_prefix><gene without 'IGHV'>*<hash>``, where ``<hash>`` is the
    last four hex digits of the SHA-256 of ``feature['ref_seq']``.

    When ``imgt_feature_name`` is ``'exon_2'`` the interval is split into two
    records instead: ``V-REGION`` covering ``start + 11`` to ``end`` and
    ``L_PART-2`` covering ``start`` to ``start + 11``. ``name_prefix`` is not
    used in that case.

    Args:
        feature: mapping with ``'gene'``, ``'ref_seq'``, ``'start'`` and
            ``'end'`` entries.
        imgt_feature_name: feature label to record, or ``'exon_2'`` to
            request the split described above.
        name_prefix: prefix for the generated sequence name.
        feature_id: value written into the ``ID=`` part of the attributes.
        parent_id: parent identifier passed to ``add_feature_to_ref``.
        ref: reference sequence record the feature belongs to.
    """
    gene = feature['gene']
    ref_seq = feature['ref_seq']
    start = feature['start']
    end = feature['end']

    # Shared by every generated name, so compute it once.
    short_gene = gene.replace('IGHV', '')
    digest = sha256(ref_seq.encode('utf-8')).hexdigest()[-4:]

    if imgt_feature_name != 'exon_2':
        add_feature_to_ref(
            f"{name_prefix}{short_gene}*{digest}", 'gene', imgt_feature_name,
            ref_seq, 'CDS', start, end, '+',
            f"Name={gene}_{imgt_feature_name};ID={feature_id}",
            parent_id, ref)
        return

    # The first 11 positions of exon_2 are recorded as L_PART-2; the
    # remainder is the V-REGION.
    l_part2_length = 11
    region_start = start + l_part2_length

    add_feature_to_ref(
        f"IGHVRegion{short_gene}*{digest}", 'gene', 'V-REGION', ref_seq,
        'CDS', region_start, end, '+',
        f"Name={gene}_V-REGION;ID={feature_id}", parent_id, ref)

    # The attribute label used to read "_L_PART1", which mislabelled this
    # L_PART-2 record as leader part 1; it now matches the feature it names.
    add_feature_to_ref(
        f"IGHVLpart2{short_gene}*{digest}", 'gene', 'L_PART-2', ref_seq,
        'CDS', start, region_start, '+',
        f"Name={gene}_L_PART-2;ID={feature_id}", parent_id, ref)
Exemplo n.º 4
0
def process_igenotyper_record(session, species, dataset_dir, subject,
                              annotation_file, reference_features):
    """Import the annotation rows belonging to ``subject``.

    The annotation file is copied to ``<dataset_dir>/samples`` if no file
    exists there, and its parsed rows are cached in the module-level
    ``annotation_records`` dict keyed by file path. Rows whose
    ``sample_name`` equals ``subject.name_in_study`` are then processed; only
    rows whose ``genotyper_gene`` has ``'V'`` as its fourth character are
    handled. For each such row the allele is found or saved, linked to the
    subject and to a ``V-REGION`` feature on ``subject.ref_seq`` (created if
    missing), and the remaining per-row features are added via
    ``add_feature``.

    ``species`` is not used by this function.
    """
    global annotation_records

    print(f"Importing subject {subject.identifier}")

    samples_path = os.path.join(dataset_dir, 'samples')
    if not os.path.isfile(samples_path):
        shutil.copy(annotation_file, samples_path)

    if annotation_file not in annotation_records:
        annotation_records[annotation_file] = read_csv(annotation_file)

    subject_rows = [
        rec for rec in annotation_records[annotation_file]
        if rec['sample_name'] == subject.name_in_study
    ]

    # (IMGT feature name, bed-file feature name), in processing order.
    # TODO - use IMGT names in bed files and get rid of one of these arguments
    extra_features = (
        ('V-NONAMER', 'nonamer'),
        ('V-SPACER', 'spacer'),
        ('V-HEPTAMER', 'heptamer'),
        ('L-PART2', 'exon_2'),
        ('V-INTRON', 'gencode_intron'),
        ('L-PART1', 'exon_1'),
    )

    for row in subject_rows:
        # The fourth character of the gene name is the chain letter.
        if row['genotyper_gene'][3] != 'V':
            continue

        seq = find_allele_by_name(session, row['vdjbase_allele'])
        if not seq:
            seq = save_novel_allele(session, row['genotyper_gene'],
                                    row['vdjbase_allele'],
                                    row['notes'].replace('\\n', '\r\n'),
                                    row['V-REGION'],
                                    row['V-REGION-GAPPED'])

        haplotype = int(row['haplotype'].replace('h=', ''))
        update_subject_sequence_link(session, haplotype, subject, seq)

        region_feature = find_feature_by_name(session, 'V-REGION', seq.name,
                                              subject.ref_seq)

        if region_feature:
            # An existing feature is reused even on mismatch; only report it.
            if seq.sequence != region_feature.feature_seq:
                print(
                    f'Error: feature {region_feature.name} sequence does not match that of sequence {seq.name} in subject {subject.identifier}'
                )
        else:
            feature_id = session.query(Feature).count()
            exon_2 = reference_features[subject.ref_seq.name][
                seq.gene.name]['exon_2']
            # V-REGION begins 11 positions into the exon_2 interval.
            region_start = exon_2['start'] + 11
            region_end = exon_2['end']
            region_feature = add_feature_to_ref(
                seq.name, 'allele', 'V-REGION', seq.sequence, 'CDS',
                region_start, region_end, '+',
                f"Name={seq.name}_V-REGION;ID={feature_id}", feature_id,
                subject.ref_seq)

        link_sequence_to_feature(seq, region_feature)

        for imgt_name, bed_name in extra_features:
            add_feature(imgt_name, bed_name, reference_features, row, seq,
                        session, subject)
Exemplo n.º 5
0
def process_imgt_assembly(session, sample_data):
    """Import an annotated assembly page as a reference sequence with features.

    Downloads ``sample_data['URL']``, extracts the sequence text and the
    feature table from the page, saves the dataset, reference sequence, study
    and sample records, and then walks the feature table with a small state
    machine:

    * a ``V-GENE`` / ``D-GENE`` / ``J-GENE`` row opens a gene and fixes its
      range and strand;
    * the next ``IMGT_allele`` row names it and adds ``gene`` and ``mRNA``
      features;
    * subsequent rows with a label allowed for that gene type add a
      sub-feature plus a saved sequence linked to the sample;
    * a ``3'UTR`` row closes the gene.

    Args:
        session: not used here; the function commits through ``db.session``.
        sample_data: mapping that must contain ``Assembly_id`` and
            ``Assembly_reference`` (checked up front) and is also read for
            ``Species``, ``Sample``, ``URL``, ``Locus``, ``Dataset``,
            ``Institute``, ``Researcher``, ``Reference``, ``Contact``,
            ``Type`` and ``Date``.

    Returns:
        ``None`` when a required item is missing; otherwise the joined
        ``results`` (see the review note at the end of the function).
    """
    needed_items = ['Assembly_id', 'Assembly_reference']

    missing_items = [item for item in needed_items if item not in sample_data]
    for item in missing_items:
        print('%s not specified' % item)
    if missing_items:
        return

    print("Importing %s / %s" %
          (sample_data['Species'], sample_data['Sample']))

    page = requests.get(sample_data['URL'])
    tree = html.fromstring(page.content)

    seq_text = tree.xpath('//div[@class="sequence"]/pre')[0]

    # Keep only lines longer than 75 characters and take characters 1-69 with
    # blanks removed (presumably the bases column of the listing - the page
    # layout is not visible from here).
    sequence = ''.join(
        line[1:70].replace(' ', '')
        for line in seq_text.text.split('\n')
        if len(line) > 75)

    sp, data_set = save_genomic_dataset_details(sample_data['Locus'],
                                                sample_data['Dataset'],
                                                sample_data['Species'])
    ref_seq = save_genomic_ref_seq(sample_data['Locus'],
                                   sample_data['Assembly_id'], sp, sequence,
                                   sample_data['Assembly_reference'])
    db.session.commit()
    study = save_genomic_study(sample_data['Sample'], sample_data['Institute'],
                               sample_data['Researcher'],
                               sample_data['Reference'],
                               sample_data['Contact'])

    sample = save_genomic_sample(sample_data['Sample'], sample_data['Type'],
                                 sample_data['Date'], study, sp.id, ref_seq.id,
                                 data_set.id, sample_data['URL'])

    features = tree.xpath('//div[@class="features"]/table')[0]

    invalid_range = (('1', '1'), '+')

    # Defined once here rather than being re-created for every table row.
    def get_range(s):
        """Parse ``a..b`` or ``complement(a..b)`` into ``([a, b], strand)``.

        ``a`` and ``b`` stay strings. On an unparsable, reversed or
        implausibly long (> 10,000,000) range a message is printed and
        ``(('1', '1'), '+')`` is returned.
        """
        bounds = s.split('..')
        if len(bounds) < 2:
            print('Invalid gene range found: %s' % s)
            return invalid_range

        # Drop the partial-feature markers.
        for i in (0, 1):
            bounds[i] = bounds[i].replace('>', '').replace('<', '')

        strand = '+'
        if 'complement(' in bounds[0]:
            bounds[0] = bounds[0].replace('complement(', '')
            bounds[1] = bounds[1].replace(')', '')
            strand = '-'

        try:
            start, end = int(bounds[0]), int(bounds[1])
        except ValueError:
            # Only int() parsing can fail here; other errors should surface.
            print('Invalid gene range found: %s' % s)
            return invalid_range

        if end - start > 10000000 or start > end:
            print('Invalid gene range found: %s' % s)
            return invalid_range

        return bounds, strand

    gene_labels = ('V-GENE', 'D-GENE', 'J-GENE')

    # Sub-feature labels imported for each gene type.
    # NOTE(review): J-GENE used to list 'D-REGION', so J-REGION rows were
    # never imported. This assumes the module-level feature_type mapping has
    # a 'J-REGION' entry - confirm.
    sub_feature_labels = {
        'V-GENE': ("5'UTR", 'L-PART1', 'V-INTRON', 'L-PART2', 'V-REGION',
                   "3'UTR"),
        'D-GENE': ("5'UTR", 'D-REGION', "3'UTR"),
        'J-GENE': ("5'UTR", 'J-REGION', "3'UTR"),
    }

    state = None
    name = None
    full_name = None
    gene_range = None
    strand = None
    parent_id = 0

    for row in features:
        values = [col.text for col in row]

        if len(values) < 3:
            continue

        if not state:
            # Waiting for a gene header row.
            if values[0] in gene_labels:
                gene_range, strand = get_range(values[2])
                state = values[0]

        elif not name:
            # Inside a gene, waiting for the allele row that names it.
            if values[1] == 'IMGT_allele':
                parent_id += 1
                name = values[2].split('*')[0]
                full_name = values[2]
                add_feature_to_ref(name, 'gene', gene_range[0], gene_range[1],
                                   strand, f'Name={name};ID={parent_id}',
                                   parent_id, ref_seq)

                # The mRNA takes the next id and is parented by the gene.
                parent_id += 1
                add_feature_to_ref(name, 'mRNA', gene_range[0], gene_range[1],
                                   strand, f'Name={name};ID={parent_id}',
                                   parent_id - 1, ref_seq)

        elif values[0] in sub_feature_labels[state]:
            gene_range, strand = get_range(values[2])

            # Region features carry the full allele name; the rest get a
            # derived name and no IMGT name.
            if 'REGION' in values[0]:
                seq_name = full_name
                imgt_name = full_name
            else:
                seq_name = name + '_' + values[0]
                imgt_name = ''

            f = add_feature_to_ref(
                seq_name, feature_type[values[0]], gene_range[0],
                gene_range[1], strand,
                'Name=%s;Parent=%s' % (name + '_' + values[0], parent_id),
                parent_id - 1, ref_seq)
            s = save_genomic_sequence(
                seq_name, imgt_name, values[0], False, False, 'U',
                ref_seq.sequence[int(gene_range[0]) -
                                 1:int(gene_range[1])], '', sp)

            s.features.append(f)
            SampleSequence(sample=sample,
                           sequence=s,
                           chromosome='h1,h2',
                           chromo_count=2)

        # A 3'UTR row closes the current gene.
        if state and name and values[0] == "3'UTR":
            state = None
            name = None

    db.session.commit()
    # NOTE(review): `results` is not defined in this function; unless it is a
    # module-level name this line raises NameError - confirm.
    return '\n'.join(results)