Пример #1
0
    def test_get_region_codon_indeces_in_feature__positive_strand(self):
        """Forward-strand CDS: regions map onto the expected codon indeces.

        Two CDS features are exercised, one located at 0..12 and one at
        1..12, so the same regions fall on different codon boundaries. Only
        the set of returned indeces is compared, so ordering and duplicates
        in the result are not checked.
        """
        def assert_codons(feature, regions, expected_indeces):
            # Every region in the group must produce the same set of indeces.
            wanted = set(expected_indeces)
            for region in regions:
                observed = get_region_codon_indeces_in_feature(
                    feature, region)
                self.assertEqual(wanted, set(observed))

        # Feature whose first codon starts at position 0.
        cds_at_0 = SeqFeature(
            FeatureLocation(0, 12, strand=1), type='CDS', id='1')
        assert_codons(cds_at_0, [(0, 5), (0, 6), (1, 6)], [0, 1])
        assert_codons(cds_at_0, [(0, 7), (1, 7)], [0, 1, 2])

        # Feature shifted by one base: the third codon is reached one
        # position later than for the feature above.
        cds_at_1 = SeqFeature(
            FeatureLocation(1, 12, strand=1), type='CDS', id='1')
        assert_codons(
            cds_at_1, [(0, 5), (0, 6), (1, 6), (0, 7), (1, 7)], [0, 1])
        assert_codons(cds_at_1, [(0, 8), (1, 8)], [0, 1, 2])
    def test_get_region_codon_indeces_in_feature__positive_strand(self):
        """Table-driven check of codon indeces for forward-strand features.

        Each case pairs a CDS feature with a group of regions and the set of
        codon indeces every region in that group is expected to yield.

        NOTE(review): a method with this exact name also appears earlier in
        this source; if both live in the same class, only the later
        definition is kept by Python -- confirm whether one should be
        removed or renamed.
        """
        def make_cds(start):
            # Both features end at 12 and lie on the forward strand.
            return SeqFeature(
                FeatureLocation(start, 12, strand=1), type='CDS', id='1')

        from_zero = make_cds(0)
        from_one = make_cds(1)

        cases = (
            # (feature, regions, expected codon indeces)
            (from_zero, ((0, 5), (0, 6), (1, 6)), set([0, 1])),
            (from_zero, ((0, 7), (1, 7)), set([0, 1, 2])),
            (from_one, ((0, 5), (0, 6), (1, 6), (0, 7), (1, 7)),
             set([0, 1])),
            (from_one, ((0, 8), (1, 8)), set([0, 1, 2])),
        )

        for feature, regions, expected in cases:
            for region in regions:
                actual = get_region_codon_indeces_in_feature(feature, region)
                self.assertEqual(expected, set(actual))
Пример #3
0
def does_feature_have_codon_in_list_region(feature,
                                           region,
                                           seq_record,
                                           codon_list,
                                           return_codon_index=False):
    """Reports whether a codon from codon_list occurs in the feature's
    codons that fall within the region.

    Args:
        feature: Feature whose sequence is extracted from seq_record.seq.
        region: Region handed to get_region_codon_indeces_in_feature to
            select which codons of the feature are inspected.
        seq_record: Record providing the sequence the feature is cut from.
        codon_list: Container of codon strings to look for.
        return_codon_index: If True, return the index of the first matching
            codon instead of True.

    Returns:
        True (or the matching codon's index when return_codon_index is set)
        for the first inspected codon found in codon_list; False when none
        matches. Note that a match at codon index 0 is returned as 0, which
        compares equal to False, so index-requesting callers should test the
        result with ``is False``.
    """
    coding_seq = str(feature.extract(seq_record.seq))

    for idx in get_region_codon_indeces_in_feature(feature, region):
        # Codon idx occupies three consecutive bases of the extracted
        # feature sequence.
        offset = idx * 3
        if coding_seq[offset:offset + 3] not in codon_list:
            continue
        return idx if return_codon_index else True

    # None of the inspected codons is in codon_list.
    return False
Пример #4
0
def fix_gc_content(refactor_context,
                   gc_content_constraint_obj,
                   start_bound=None,
                   end_bound=None,
                   debug=False,
                   report_file=None):
    """Fixes the GC content according to desired constraints.

    Strategy:
        Slide a window across the genome and bump any regions that fall
        outside of the constraint. Now, there is some subtlety here in that
        we have a good idea of how to fix coding regions (i.e. synonymous
        codon swaps), but want to avoid messing with stuff outside of
        coding regions. And so when we identify a bad window, for now,
        we limit fixes to any coding portions of that window only.

    TODOs:
        * We are only dealing with local window for now. Figure out how we want
        to deal with global window.

    Args:
        refactor_context: The RefactorContext.
        gc_content_constraint_obj: A GCContentConstraints object that
            allows the client to configure the fixes. Its local_window_size,
            local_window_lower_bound and local_window_upper_bound are used.
        start_bound: Optionally bound fixes to start at this position.
            A falsy value (None or 0) means position 0.
        end_bound: Optionally bound fixes to end at this position.
            A falsy value (None or 0) means the length of the genome record.
        debug: Debug flag. Prints helpful output. For now, runs analysis only,
            and doesn't actually make changes.
        report_file: Optional path. When given, a CSV with the columns
            interval, interval_size and avg_gc is written there, one row per
            run of consecutive out-of-bounds window centers. Rows are only
            collected in debug mode; otherwise just the header is written.

    Returns:
        A copy of the genome_record contained within refactor_context
        with the GC content made to satisfy constraints.
    """
    print('Fixing GC content...')
    updated_genome_record = copy.deepcopy(refactor_context.get_genome_record())

    # Figure out effective bounds.
    effective_start_bound = start_bound if start_bound else 0
    effective_end_bound = end_bound if end_bound else len(
        updated_genome_record)

    # Features that we can do synonymous swaps in
    swappable_features = [
        feature for feature in updated_genome_record.features
        if feature.type == 'CDS'
    ]

    window_size = gc_content_constraint_obj.local_window_size
    # Floor division keeps the window positions integral on both Python 2
    # and Python 3.
    half_window = window_size // 2
    lower_bound = gc_content_constraint_obj.local_window_lower_bound
    upper_bound = gc_content_constraint_obj.local_window_upper_bound

    # Slide the window looking for violations of GC content restrictions.
    window_center_range = range(effective_start_bound + half_window,
                                effective_end_bound - half_window)
    report_intervals = []
    # Debug-only bookkeeping: the current run of consecutive out-of-bounds
    # window centers, as an inclusive (first, last) tuple, and the sum of the
    # GC fractions seen within it.
    running_interval = None
    running_gc_total = 0
    for window_center_pos in window_center_range:
        window_start_pos = window_center_pos - half_window
        window_end_pos = window_start_pos + window_size
        window_seq = updated_genome_record.seq[window_start_pos:window_end_pos]
        # NOTE(review): GC is presumably Bio.SeqUtils.GC, which returns a
        # percentage; dividing by 100 turns it into the fraction the bounds
        # are compared against -- confirm.
        gc_content = GC(window_seq) / 100
        if lower_bound <= gc_content <= upper_bound:
            # GC is all good.
            if debug and running_interval is not None:
                # Close the running interval and print it out.
                _record_gc_violation_interval(
                    report_intervals, running_interval, running_gc_total)
                running_interval = None
                running_gc_total = 0
            continue

        if debug:
            # Analysis only: extend (or open) the running interval and move
            # on without touching the genome.
            if running_interval is None:
                running_interval = (window_center_pos, window_center_pos)
            else:
                running_interval = (running_interval[0], window_center_pos)
            running_gc_total += gc_content
            continue

        # As a first stab, only attempt fixes in the simplest of cases.
        # That is, only do synonymous codon swaps within parts of features
        # that are not overlapping.

        # First identify all features overlaped by the interval.
        interval = (window_start_pos, window_end_pos)
        overlapped_features = calc_interval_list_to_features_overlapped(
            [interval], swappable_features)[0]
        if len(overlapped_features) != 1:
            # TODO: Eventually handle more complex cases.
            continue

        # Otherwise attempt to fix.
        feature = overlapped_features[0]
        feature_seq = str(feature.extract(updated_genome_record.seq))

        # Figure out the specific codons that need to be changed.
        affected_codon_indeces = get_region_codon_indeces_in_feature(
            feature, interval)
        if len(affected_codon_indeces) == 0:
            # Nothing in this feature to swap for this window; indexing the
            # empty list below would raise IndexError.
            continue
        avoid_codons_in_positions = {}
        for codon_index in affected_codon_indeces:
            codon = feature_seq[codon_index * 3:codon_index * 3 + 3]
            # NOTE(review): if GC() returns a percentage, this only selects
            # codons with no G/C at all -- confirm the threshold is intended.
            if GC(codon) < 1.0:
                avoid_codons_in_positions[codon_index] = codon

        # Perform replace.
        first_codon_to_modify = affected_codon_indeces[0]
        last_codon_to_modify = affected_codon_indeces[-1]
        assert first_codon_to_modify <= last_codon_to_modify
        result = replace_codons_in_single_feature(
            refactor_context,
            feature.id,
            explicit_genome_record=updated_genome_record,
            start_codon_index=first_codon_to_modify,
            last_codon_index=last_codon_to_modify,
            avoid_codons_in_positions=avoid_codons_in_positions)
        if not result['is_success']:
            # TODO: Do something better for debugging here, although
            # we don't necessarily need each replace to succeed.
            continue

        update_seq_record_feature(updated_genome_record, feature.id, result)

    if debug and running_interval is not None:
        # The last windows were out of bounds, so the interval was never
        # closed inside the loop; report it instead of dropping it.
        _record_gc_violation_interval(
            report_intervals, running_interval, running_gc_total)

    print('...Done.')

    if report_file:
        print('Writing report.')
        REPORT_FIELDNAMES = [
            'interval',
            'interval_size',
            'avg_gc',
        ]
        with open(report_file, 'w') as report_fh:
            writer = csv.DictWriter(report_fh, REPORT_FIELDNAMES)
            writer.writeheader()
            for interval in report_intervals:
                writer.writerow(interval)

    return updated_genome_record


def _record_gc_violation_interval(report_intervals, running_interval,
                                  running_gc_total):
    """Appends one finished out-of-bounds interval to the report and prints it.

    Args:
        report_intervals: List of report rows; a new row is appended to it.
        running_interval: Tuple (first, last) of window center positions,
            both inclusive.
        running_gc_total: Sum of the GC fractions of the windows centered
            within the interval.
    """
    interval_size = running_interval[1] - running_interval[0] + 1
    avg_gc = running_gc_total / interval_size
    report_intervals.append({
        'interval': str(running_interval),
        'interval_size': interval_size,
        'avg_gc': avg_gc,
    })
    print('%s, size: %d, average_gc: %f' %
          (str(running_interval), interval_size, avg_gc))
Пример #5
0
def _remove_homopolymer_run_in_coding_feature(
        refactor_context,
        mutable_genome_record,
        h_run_obj,
        feature):
    """Removes a homopolymer run in a coding feature.

    The strategy is to muddle up all affected codons within the feature
    so as to reduce the chance of "snap-back" over generations.

    Args:
        refactor_context: The RefactorContext.
        mutable_genome_record: The SeqRecord object representing the genome.
            It is handed to update_seq_record_feature on success and the
            same object is returned.
        h_run_obj: Object containing data about a specific occurrence of a
            homopolymer run. Only its 'interval' entry is read.
        feature: The feature that is overlapped by the homopolymer run.

    Returns:
        Object with keys:
            * is_success: Whether remove succeeded.
            * updated_genome_record: The updated genome_record if successful.
            * exception_string: Message describing failure.
    """
    interval = h_run_obj['interval']

    feature_seq = str(feature.extract(mutable_genome_record.seq))

    # Figure out the specific codons that need to be changed.
    affected_codon_indeces = get_region_codon_indeces_in_feature(
            feature, interval)
    if len(affected_codon_indeces) == 0:
        # Report a regular failure rather than raising IndexError on the
        # first/last lookups below.
        return {
                'is_success': False,
                'exception_string': (
                        'No codons of feature %s fall in interval %s.' %
                        (feature.id, str(interval)))
        }

    # Map each affected codon index to the codon currently there, so the
    # replacement can be told which codon to avoid at which position.
    avoid_codons_in_positions = {}
    for codon_index in affected_codon_indeces:
        codon = feature_seq[codon_index * 3 : codon_index * 3 + 3]
        avoid_codons_in_positions[codon_index] = codon

    # Perform replace.
    first_codon_to_modify = affected_codon_indeces[0]
    last_codon_to_modify = affected_codon_indeces[-1]
    assert first_codon_to_modify <= last_codon_to_modify
    result = replace_codons_in_single_feature(
            refactor_context,
            feature.id,
            explicit_genome_record=mutable_genome_record,
            start_codon_index=first_codon_to_modify,
            last_codon_index=last_codon_to_modify,
            avoid_codons_in_positions=avoid_codons_in_positions)
    if not result['is_success']:
        return {
                'is_success': False,
                'exception_string': result['exception_string']
        }

    update_seq_record_feature(
            mutable_genome_record,
            feature.id,
            result
    )
    return {
            'is_success': True,
            'updated_genome_record': mutable_genome_record
    }
Пример #6
0
def _remove_site_in_coding_feature(
        refactor_context,
        mutable_genome_record,
        site_occur_obj,
        feature):
    """Removes a restriction site that falls in a region annotated as
    coding.

    The strategy is to muddle up all affected codons within the feature
    so as to reduce the chance of "snap-back" over generations.

    Args:
        refactor_context: The RefactorContext.
        mutable_genome_record: The SeqRecord object representing the genome.
            It is handed to update_seq_record_feature on success and the
            same object is returned.
        site_occur_obj: Object containing data about a specific occurrence of a
            restriction enzyme site in the genome. Only its 'interval' entry
            is read.
        feature: The feature that is overlapped by the restriction site.

    Returns:
        Object with keys:
            * is_success: Whether remove succeeded.
            * updated_genome_record: The updated genome_record if successful.
            * exception_string: Message describing failure.
    """
    interval = site_occur_obj['interval']

    # Figure out the specific codons that need to be changed.
    affected_codon_indeces = get_region_codon_indeces_in_feature(
            feature, interval)
    if len(affected_codon_indeces) == 0:
        # Report a regular failure rather than raising IndexError on the
        # first/last lookups below.
        return {
                'is_success': False,
                'exception_string': (
                        'No codons of feature %s fall in interval %s.' %
                        (feature.id, str(interval)))
        }

    # Map each affected codon index to the codon currently there, so the
    # replacement can be told which codon to avoid at which position.
    avoid_codons_in_positions = {}
    feature_seq = str(feature.extract(mutable_genome_record.seq))
    for codon_index in affected_codon_indeces:
        codon = feature_seq[codon_index * 3 : codon_index * 3 + 3]
        avoid_codons_in_positions[codon_index] = codon

    # Perform replace.
    first_codon_to_modify = affected_codon_indeces[0]
    last_codon_to_modify = affected_codon_indeces[-1]
    assert first_codon_to_modify <= last_codon_to_modify
    result = replace_codons_in_single_feature(
            refactor_context,
            feature.id,
            explicit_genome_record=mutable_genome_record,
            start_codon_index=first_codon_to_modify,
            last_codon_index=last_codon_to_modify,
            avoid_codons_in_positions=avoid_codons_in_positions)
    if not result['is_success']:
        return {
                'is_success': False,
                'exception_string': result['exception_string']
        }

    update_seq_record_feature(
            mutable_genome_record,
            feature.id,
            result
    )
    return {
            'is_success': True,
            'updated_genome_record': mutable_genome_record
    }