def build_fragment(self):
    """Build and persist a linear Fragment for this record's sequence.

    Pre-chunks the sequence at feature start and end locations so that
    no chunk should need further division during import, then appends
    each chunk to a freshly created Fragment.

    Returns:
        The indexed Fragment holding the full sequence.
    """
    # Break the sequence at every feature start and at the base just
    # past every feature end (coordinates appear to be 1-based,
    # inclusive — TODO confirm against the feature parser).
    break_points = sorted(set(
        [f[0] for f in self.__features] +
        [f[1] + 1 for f in self.__features]))

    chunk_sizes = []
    for i, bp in enumerate(break_points):
        if i == 0:
            # Leading chunk before the first break point, if any.
            if bp > 1:
                chunk_sizes.append(bp - 1)
        else:
            chunk_sizes.append(bp - break_points[i - 1])

    # print() with a single argument behaves identically under Python 2
    # and 3; the Py2-only `print x` statement form is avoided for
    # consistency with the Python 3 print calls elsewhere in this file.
    print('%d chunks' % (len(chunk_sizes),))

    new_fragment = Fragment(name=self.__rec.id, circular=False,
                            parent=None, start_chunk=None)
    new_fragment.save()
    new_fragment = new_fragment.indexed_fragment()

    prev = None
    flen = 0
    seqlen = len(self.__sequence)
    for sz in chunk_sizes:
        prev = new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:flen + sz])
        flen += sz
    # Trailing chunk past the last break point (or the whole sequence
    # when there were no features at all).
    if flen < seqlen:
        new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:seqlen])

    return new_fragment
def build_fragment(self):
    """Build and persist a linear Fragment for this record's sequence.

    The sequence is pre-chunked at feature start/end boundaries so that
    no chunk should need to be divided again during import; each chunk
    is then appended to a newly created Fragment.

    Returns:
        The indexed Fragment covering the full sequence.
    """
    # Collect unique break points: every feature start plus the base
    # just past every feature end.
    break_points = sorted(set(
        [f[0] for f in self.__features] +
        [f[1] + 1 for f in self.__features]))

    chunk_sizes = []
    prev_bp = None
    for bp in break_points:
        if prev_bp is None:
            # Leading chunk before the first break point, if any.
            if bp > 1:
                chunk_sizes.append(bp - 1)
        else:
            chunk_sizes.append(bp - prev_bp)
        prev_bp = bp

    # Parenthesized print works the same under Python 2 and 3; the
    # Py2-only `print x` statement is avoided for consistency with the
    # Python 3 print calls elsewhere in this file.
    print('%d chunks' % (len(chunk_sizes),))

    new_fragment = Fragment(name=self.__rec.id, circular=False,
                            parent=None, start_chunk=None)
    new_fragment.save()
    new_fragment = new_fragment.indexed_fragment()

    prev = None
    flen = 0
    seqlen = len(self.__sequence)
    for sz in chunk_sizes:
        prev = new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:flen + sz])
        flen += sz
    # Trailing chunk after the last break point (or the whole sequence
    # when there were no features).
    if flen < seqlen:
        new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:seqlen])

    return new_fragment
def build_fragment(self, reference_based=True, dirn='.'):
    """Build and persist a Fragment for this record.

    The sequence is pre-chunked at every feature and subfeature
    start/end location so there should be no need to further divide
    any chunk during import.

    Args:
        reference_based: when True, write the sequence to a reference
            file and bulk-create chunk rows that point into it; when
            False, store each chunk's sequence directly in the
            database.
        dirn: directory for the generated reference file (only used
            when reference_based is True).

    Returns:
        The indexed Fragment.
    """
    chunk_sizes = self._compute_chunk_sizes()

    new_fragment = Fragment(
        name=self.__rec.id,
        circular=self._detect_circularity(),
        parent=None,
        start_chunk=None
    )
    new_fragment.save()
    print("Fragment %s" % (new_fragment.id))
    new_fragment = new_fragment.indexed_fragment()

    if reference_based:
        print("%d chunks" % (len(chunk_sizes),))
        t0 = time.time()
        Chunk.CHUNK_REFERENCE_CLASS.generate_from_fragment(
            new_fragment, str(self.__rec.seq), dirn=dirn
        )
        print("Reference file generation took %s seconds" % (time.time() - t0))
        new_fragment._bulk_create_fragment_chunks(chunk_sizes)
        return new_fragment

    # Database-stored sequences must fit in a single insert (e.g.
    # MySQL has a packet size limit), so oversized chunks are divided
    # before insertion.
    chunk_sizes = self._split_oversized_chunks(chunk_sizes, 1000000)
    print("%d chunks" % (len(chunk_sizes),))

    prev = None
    fragment_len = 0
    for chunk_size in chunk_sizes:
        t0 = time.time()
        prev = new_fragment._append_to_fragment(
            prev,
            fragment_len,
            str(self.__rec.seq[fragment_len : fragment_len + chunk_size]),
        )
        fragment_len += chunk_size
        print("add chunk to fragment: %.4f\r" % (time.time() - t0,), end="")

    return new_fragment

def _compute_chunk_sizes(self):
    """Return chunk sizes covering the whole sequence, broken at every
    feature/subfeature start and one-past-end position."""
    starts_and_ends = []
    for feature in self.__features:
        name = feature[2]
        starts_and_ends.append(feature[0])
        starts_and_ends.append(feature[1] + 1)
        for subfeature in self.__subfeatures_dict[name]:
            starts_and_ends.append(subfeature[0])
            starts_and_ends.append(subfeature[1] + 1)
    break_points = sorted(set(starts_and_ends))

    chunk_sizes = []
    cur_len = 0
    prev_bp = None
    for bp in break_points:
        if prev_bp is None:
            # Leading chunk before the first break point, if any.
            if bp > 1:
                chunk_sizes.append(bp - 1)
                cur_len += bp - 1
        else:
            chunk_sizes.append(bp - prev_bp)
            cur_len += bp - prev_bp
        prev_bp = bp
    # Trailing chunk past the last break point (or the entire sequence
    # when there are no features).
    if cur_len < self.__seqlen:
        chunk_sizes.append(self.__seqlen - cur_len)
    return chunk_sizes

def _detect_circularity(self):
    """Return True when the first whole-sequence feature (region /
    chromosome) carries an Is_circular=true qualifier."""
    for feature in self.__rec.features:
        # Only consider features that cover the entire sequence.
        if feature.type.upper() in ['REGION', 'CHR', 'CHROM', 'CHROMOSOME']:
            if 'Is_circular' in feature.qualifiers:
                return feature.qualifiers['Is_circular'][0].upper() == 'TRUE'
            # First matching feature decides; stop looking either way.
            return False
    return False

def _split_oversized_chunks(self, chunk_sizes, chunk_size_limit):
    """Split every chunk larger than chunk_size_limit into limit-sized
    pieces plus a remainder, so each piece fits in one DB insert."""
    new_chunk_sizes = []
    for size in chunk_sizes:
        if size < chunk_size_limit:
            new_chunk_sizes.append(size)
        else:
            # Equivalent to repeatedly carving off limit-sized pieces.
            full, remainder = divmod(size, chunk_size_limit)
            new_chunk_sizes.extend([chunk_size_limit] * full)
            if remainder:
                new_chunk_sizes.append(remainder)
    return new_chunk_sizes