def annotate_sequence(seqrecord, features=('gene', 'RNA structure', 'other')):
    '''Annotate a consensus with the genes and stuff (in place).

    Parameters:
       seqrecord: sequence record to annotate. It is converted with
          np.array() to search for the region edges, and new SeqFeature
          objects are appended to its `features` list.
       features: iterable of feature types to annotate. Supported types are
          'gene', 'RNA structure', 'PCR primers' and 'other'.

    Returns:
       None: the record is modified in place.

    Raises:
       KeyError: if a requested feature type is not supported.
    '''
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import (gene_edges,
                                               RNA_structure_edges,
                                               other_edges,
                                               find_region_edges,
                                               find_region_edges_multiple)
    from hivwholeseq.data.primers import primers_PCR as primers_PCR_edges

    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'PCR primers': primers_PCR_edges,
                 'other': other_edges}

    smat = np.array(seqrecord)
    seqlen = len(smat)

    # Ids already on the record. The set is kept up to date while features
    # are added, so the feature list is not rescanned for every candidate.
    present_ids = set(fea.id for fea in seqrecord.features)

    for feature_type in features:
        # .items() (rather than .iteritems()) works on Python 2 and 3 alike
        for name, edges in edge_dict[feature_type].items():
            # Skip a feature if it's present already
            if name in present_ids:
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                if 'F6' in name:
                    # LTR problems with F6: search on the reversed sequence
                    # with reversed (and swapped) edges, then map the hits
                    # back onto forward coordinates.
                    # NOTE(review): the `seqlen - 1 - x` mapping treats both
                    # returned positions as inclusive indices; confirm this
                    # against what find_region_edges returns, since
                    # FeatureLocation coordinates are half-open.
                    rev_edge = find_region_edges(
                        smat[::-1], [edges[1][::-1], edges[0][::-1]])
                    pos_edge = [seqlen - 1 - rev_edge[1],
                                seqlen - 1 - rev_edge[0]]
                else:
                    pos_edge = find_region_edges(smat, edges)
                location = FeatureLocation(*pos_edge)
            else:
                # Split region: one location per fragment, joined together
                pos_edges = find_region_edges_multiple(smat, edges)
                location = CompoundLocation(
                    [FeatureLocation(*pos_edge) for pos_edge in pos_edges])

            feature = SeqFeature(location, type=feature_type, id=name,
                                 strand=1)
            seqrecord.features.append(feature)
            present_ids.add(name)
def annotate_sequence(seqrecord, features=('gene', 'RNA structure', 'other')):
    '''Annotate a consensus with the genes and stuff (in place).

    Parameters:
       seqrecord: sequence record to annotate. It is converted with
          np.array() to search for the region edges, and new SeqFeature
          objects are appended to its `features` list.
       features: iterable of feature types to annotate. Supported types are
          'gene', 'RNA structure', 'PCR primers' and 'other'.

    Returns:
       None: the record is modified in place.

    Raises:
       KeyError: if a requested feature type is not supported.
    '''
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import (gene_edges,
                                               RNA_structure_edges,
                                               other_edges,
                                               find_region_edges,
                                               find_region_edges_multiple)
    from hivwholeseq.data.primers import primers_PCR as primers_PCR_edges

    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'PCR primers': primers_PCR_edges,
        'other': other_edges,
    }

    smat = np.array(seqrecord)
    seqlen = len(smat)

    # Ids already on the record. The set is kept up to date while features
    # are added, so the feature list is not rescanned for every candidate.
    present_ids = set(fea.id for fea in seqrecord.features)

    for feature_type in features:
        # .items() (rather than .iteritems()) works on Python 2 and 3 alike
        for name, edges in edge_dict[feature_type].items():
            # Skip a feature if it's present already
            if name in present_ids:
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                if 'F6' in name:
                    # LTR problems with F6: search on the reversed sequence
                    # with reversed (and swapped) edges, then map the hits
                    # back onto forward coordinates.
                    # NOTE(review): the `seqlen - 1 - x` mapping treats both
                    # returned positions as inclusive indices; confirm this
                    # against what find_region_edges returns, since
                    # FeatureLocation coordinates are half-open.
                    rev_edge = find_region_edges(
                        smat[::-1], [edges[1][::-1], edges[0][::-1]])
                    pos_edge = [
                        seqlen - 1 - rev_edge[1],
                        seqlen - 1 - rev_edge[0],
                    ]
                else:
                    pos_edge = find_region_edges(smat, edges)
                location = FeatureLocation(*pos_edge)
            else:
                # Split region: one location per fragment, joined together
                pos_edges = find_region_edges_multiple(smat, edges)
                location = CompoundLocation(
                    [FeatureLocation(*pos_edge) for pos_edge in pos_edges])

            feature = SeqFeature(location,
                                 type=feature_type,
                                 id=name,
                                 strand=1)
            seqrecord.features.append(feature)
            present_ids.add(name)
def annotate_sequence(seqrecord, additional_edges=None,
                      additional_features=('chunk',), VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place).

    The annotation runs in two passes:
     1. regions delimited by edge sequences ('gene', 'RNA structure',
        'other', plus anything in `additional_edges`) are located in the
        sequence itself;
     2. 'protein' and 'chunk' features are extracted from the annotated
        HXB2 reference and placed by overlap alignment to the sequence.

    Parameters:
       seqrecord: sequence record to annotate. New SeqFeature objects are
          appended to its `features` list.
       additional_edges: optional dict {feature type: {name: edges}} merged
          into the built-in edge tables (same keys override the built-ins).
       additional_features: iterable of feature groups for the second pass.
          'protein' is always added; of the rest only 'chunk' is recognized.
       VERBOSE: verbosity level. >= 1 prints the feature types and the names
          in each edge table, >= 2 also prints the progress per feature.
          Warnings about missing edges are printed at any level.

    Returns:
       None: the record is modified in place.

    Raises:
       ValueError: if a requested protein/chunk is not a feature of the
          HXB2 reference.
    '''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    import sys
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import (gene_edges,
                                               RNA_structure_edges,
                                               other_edges,
                                               find_region_edges,
                                               find_region_edges_multiple,
                                               locate_gene)

    def log(message, newline=True):
        '''Write a message to stdout, ending with a newline or a space.'''
        # Plain writes behave the same on Python 2 and 3, including the
        # "name found: ..." messages that are assembled across two calls
        sys.stdout.write(message + ('\n' if newline else ' '))

    edge_dict = {'gene': gene_edges,
                 'RNA structure': RNA_structure_edges,
                 'other': other_edges}
    if additional_edges:
        edge_dict.update(additional_edges)
    additional_features = ['protein'] + list(additional_features)

    if VERBOSE:
        log('Features: ' + ', '.join(list(edge_dict) + additional_features))

    smat = np.array(seqrecord)

    # Ids already on the record. The set is kept up to date while features
    # are added, so the feature list is not rescanned for every candidate.
    present_ids = set(fea.id for fea in seqrecord.features)

    for feature_type, edges_all in edge_dict.items():
        # This listing used to be printed unconditionally; it is diagnostic
        # output, so it now follows VERBOSE like the other messages
        if VERBOSE:
            log('%s %s' % (feature_type, list(edges_all)))

        for name, edges in edges_all.items():
            if VERBOSE >= 2:
                log('%s' % (name,), newline=False)

            # Skip a feature if it's present already
            if name in present_ids:
                if VERBOSE >= 2:
                    log('already present.')
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                if 'F6' in name:
                    # LTR problems with F6: look for the start edge only,
                    # and only from position 6000 onwards
                    pos_edge = find_region_edges(smat[6000:],
                                                 [edges[0], None])
                    # Shift back to full-sequence coordinates. A start that
                    # was not found (None) is left for the fallback below
                    # instead of raising a TypeError here.
                    if pos_edge[0] is not None:
                        pos_edge[0] += 6000
                # NOTE(review): the built-in key is 'gene', so this branch
                # only triggers for a 'genes' table passed via
                # additional_edges - confirm whether 'gene' was intended.
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                both_found = None not in pos_edge

                # Cut the primers for some features
                if both_found and name in ('V1', 'V3', 'V4', 'V5'):
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if both_found and name == 'V2':
                    pos_edge[1] -= len(edges[1])

                # Missing edges fall back to the sequence boundaries; this
                # is expected (no warning) for the outermost regions
                if pos_edge[0] is None:
                    if name not in ('F1', "LTR5'"):
                        log('WARNING: start not found')
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ('F6', "LTR3'"):
                        log('WARNING: end not found')
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                # NOTE(review): same 'genes' vs 'gene' question as above
                if feature_type == 'genes':
                    pos_edges = [locate_gene(smat, name + suff,
                                             output_compact=True)
                                 for suff in ('1', '2')]
                else:
                    pos_edges = find_region_edges_multiple(smat, edges,
                                                           min_distance=1)
                location = CompoundLocation(
                    [FeatureLocation(*pos_edge) for pos_edge in pos_edges])

            if VERBOSE >= 2:
                log('found: %s' % (location,))

            feature = SeqFeature(location, type=feature_type, id=name,
                                 strand=1)
            seqrecord.features.append(feature)
            present_ids.add(name)

    # Add proteins and other features from HXB2
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference

    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    # Ids of the reference features, computed once; list.index() keeps the
    # first-match lookup and the ValueError for a missing feature
    ref_ids = [fea.id for fea in ref_ann.features]

    for feagroup, feanames in additional_features_dict.items():
        for feaname in feanames:
            if VERBOSE >= 2:
                log('%s' % (feaname,), newline=False)

            fea = ref_ann.features[ref_ids.index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord, seq,
                                                score_gapopen=-20)

            # The feature spans the part of the alignment that is not
            # covered by leading/trailing gaps of the reference feature
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            # Gaps in the aligned record take up no positions in it
            end -= ali1[start:end].count('-')
            location = FeatureLocation(start, end)

            if VERBOSE >= 2:
                log('found: %s' % (location,))

            feature = SeqFeature(location, type=feagroup, id=feaname,
                                 strand=1)
            seqrecord.features.append(feature)
def annotate_sequence(seqrecord, additional_edges=None,
                      additional_features=('chunk',), VERBOSE=0):
    '''Annotate a consensus with the genes and stuff (in place).

    The annotation runs in two passes:
     1. regions delimited by edge sequences ('gene', 'RNA structure',
        'other', plus anything in `additional_edges`) are located in the
        sequence itself;
     2. 'protein' and 'chunk' features are extracted from the annotated
        HXB2 reference and placed by overlap alignment to the sequence.

    Parameters:
       seqrecord: sequence record to annotate. New SeqFeature objects are
          appended to its `features` list.
       additional_edges: optional dict {feature type: {name: edges}} merged
          into the built-in edge tables (same keys override the built-ins).
       additional_features: iterable of feature groups for the second pass.
          'protein' is always added; of the rest only 'chunk' is recognized.
       VERBOSE: verbosity level. >= 1 prints the feature types and the names
          in each edge table, >= 2 also prints the progress per feature.
          Warnings about missing edges are printed at any level.

    Returns:
       None: the record is modified in place.

    Raises:
       ValueError: if a requested protein/chunk is not a feature of the
          HXB2 reference.
    '''
    # TODO: what do we do with genes that do not start/end where they are
    # supposed to? Do we follow biology and track their new locations?
    import sys
    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
    from hivwholeseq.utils.genome_info import (gene_edges,
                                               RNA_structure_edges,
                                               other_edges,
                                               find_region_edges,
                                               find_region_edges_multiple,
                                               locate_gene)

    def log(message, newline=True):
        '''Write a message to stdout, ending with a newline or a space.'''
        # Plain writes behave the same on Python 2 and 3, including the
        # "name found: ..." messages that are assembled across two calls
        sys.stdout.write(message + ('\n' if newline else ' '))

    edge_dict = {
        'gene': gene_edges,
        'RNA structure': RNA_structure_edges,
        'other': other_edges,
    }
    if additional_edges:
        edge_dict.update(additional_edges)
    additional_features = ['protein'] + list(additional_features)

    if VERBOSE:
        log('Features: ' + ', '.join(list(edge_dict) + additional_features))

    smat = np.array(seqrecord)

    # Ids already on the record. The set is kept up to date while features
    # are added, so the feature list is not rescanned for every candidate.
    present_ids = set(fea.id for fea in seqrecord.features)

    for feature_type, edges_all in edge_dict.items():
        # This listing used to be printed unconditionally; it is diagnostic
        # output, so it now follows VERBOSE like the other messages
        if VERBOSE:
            log('%s %s' % (feature_type, list(edges_all)))

        for name, edges in edges_all.items():
            if VERBOSE >= 2:
                log('%s' % (name,), newline=False)

            # Skip a feature if it's present already
            if name in present_ids:
                if VERBOSE >= 2:
                    log('already present.')
                continue

            # Behave differently for unsplit regions and split ones
            if len(edges) == 2:
                if 'F6' in name:
                    # LTR problems with F6: look for the start edge only,
                    # and only from position 6000 onwards
                    pos_edge = find_region_edges(smat[6000:],
                                                 [edges[0], None])
                    # Shift back to full-sequence coordinates. A start that
                    # was not found (None) is left for the fallback below
                    # instead of raising a TypeError here.
                    if pos_edge[0] is not None:
                        pos_edge[0] += 6000
                # NOTE(review): the built-in key is 'gene', so this branch
                # only triggers for a 'genes' table passed via
                # additional_edges - confirm whether 'gene' was intended.
                elif feature_type == 'genes':
                    pos_edge = locate_gene(smat, name, output_compact=True)
                else:
                    pos_edge = find_region_edges(smat, edges)

                both_found = None not in pos_edge

                # Cut the primers for some features
                if both_found and name in ('V1', 'V3', 'V4', 'V5'):
                    pos_edge[0] += len(edges[0])
                    pos_edge[1] -= len(edges[1])

                # Cut only the right primer for V2
                if both_found and name == 'V2':
                    pos_edge[1] -= len(edges[1])

                # Missing edges fall back to the sequence boundaries; this
                # is expected (no warning) for the outermost regions
                if pos_edge[0] is None:
                    if name not in ('F1', "LTR5'"):
                        log('WARNING: start not found')
                    pos_edge[0] = 0

                if pos_edge[1] is None:
                    if name not in ('F6', "LTR3'"):
                        log('WARNING: end not found')
                    pos_edge[1] = len(smat)

                location = FeatureLocation(*pos_edge)
            else:
                # NOTE(review): same 'genes' vs 'gene' question as above
                if feature_type == 'genes':
                    pos_edges = [
                        locate_gene(smat, name + suff, output_compact=True)
                        for suff in ('1', '2')
                    ]
                else:
                    pos_edges = find_region_edges_multiple(smat,
                                                           edges,
                                                           min_distance=1)
                location = CompoundLocation(
                    [FeatureLocation(*pos_edge) for pos_edge in pos_edges])

            if VERBOSE >= 2:
                log('found: %s' % (location,))

            feature = SeqFeature(location,
                                 type=feature_type,
                                 id=name,
                                 strand=1)
            seqrecord.features.append(feature)
            present_ids.add(name)

    # Add proteins and other features from HXB2
    from seqanpy import align_overlap
    from hivwholeseq.utils.genome_info import proteins, chunks
    from hivwholeseq.reference import load_custom_reference

    additional_features_dict = {}
    if 'protein' in additional_features:
        additional_features_dict['protein'] = proteins
    if 'chunk' in additional_features:
        additional_features_dict['chunk'] = chunks

    ref_ann = load_custom_reference('HXB2', 'gb')
    # Ids of the reference features, computed once; list.index() keeps the
    # first-match lookup and the ValueError for a missing feature
    ref_ids = [fea.id for fea in ref_ann.features]

    for feagroup, feanames in additional_features_dict.items():
        for feaname in feanames:
            if VERBOSE >= 2:
                log('%s' % (feaname,), newline=False)

            fea = ref_ann.features[ref_ids.index(feaname)]
            seq = fea.extract(ref_ann)
            (score, ali1, ali2) = align_overlap(seqrecord,
                                                seq,
                                                score_gapopen=-20)

            # The feature spans the part of the alignment that is not
            # covered by leading/trailing gaps of the reference feature
            start = len(ali2) - len(ali2.lstrip('-'))
            end = len(ali2.rstrip('-'))
            # Gaps in the aligned record take up no positions in it
            end -= ali1[start:end].count('-')
            location = FeatureLocation(start, end)

            if VERBOSE >= 2:
                log('found: %s' % (location,))

            feature = SeqFeature(location,
                                 type=feagroup,
                                 id=feaname,
                                 strand=1)
            seqrecord.features.append(feature)