def __post_init__(self):
    """Derive the output path and helper objects from the configured paths.

    The result file is named after the RNA input file: the name returned by
    ``StrConverter.extract_file_name`` for its base name, followed by
    ``_extract_result.txt``, inside ``output_directory``.
    """
    rna_file_name = os.path.basename(self.rna_path)
    prefix = StrConverter.extract_file_name(rna_file_name)
    result_name = '%s_extract_result.txt' % prefix
    self.result_path = os.path.join(self.output_directory, result_name)
    self.gene_reader = GeneFileReader(self.data_path)
    # Column name -> index, and index -> column name, filled in later.
    self.headers = {}
    self.inv_headers = []
class GeneRangeExtract:
    """Write a table of gene ranges and the gaps between them.

    The class reads gene segments through ``GeneFileReader(data_path)`` and
    writes ``<prefix>_range_result.txt`` into ``output_directory``.

    NOTE(review): the use of ``__post_init__`` suggests the class is decorated
    with ``@dataclass`` outside this block -- confirm.
    """

    data_path: str
    output_directory: str

    def __post_init__(self):
        """Derive the result path and create the reader and header maps."""
        file_name = os.path.basename(self.data_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.result_path = os.path.join(
            self.output_directory, '%s_range_result.txt' % file_prefix)
        self.gene_reader = GeneFileReader(self.data_path)
        # generate_header() stores into these; they were never created before,
        # so any call to generate_header() raised AttributeError.
        self.headers = {}
        self.inv_headers = []

    def generate_header(self, items):
        """Rebuild the column maps from a tab-separated header line.

        ``headers`` maps column name -> index and ``inv_headers`` lists the
        column names in order. Both are rebuilt from scratch so repeated calls
        do not accumulate stale columns.
        """
        self.headers = {}
        self.inv_headers = []
        for idx, col_name in enumerate(items.strip().split('\t')):
            self.headers[col_name] = idx
            self.inv_headers.append(col_name)

    def run(self):
        """Write one row per gene and one row per uncovered region.

        Segments are visited in the order the reader provides them. Whenever
        there is at least one position between the end of the previous segment
        and the start of the current one, a ``region_N`` row is written for
        that gap. A final region row covers whatever lies between the last
        segment and ``len(dna_code)``. Region rows carry no locus tag.
        """
        self.gene_reader.build_information()
        with open(self.result_path, 'w', encoding='utf8') as fw:
            last_end = 0
            region_idx = 0
            fw.write('name\trange\tlocus_tag\n')
            segments = self.gene_reader.gene_segments
            for gene_no, gene_segment in enumerate(segments, start=1):
                left, right = gene_segment.cds
                if last_end < left - 1:
                    region_idx += 1
                    fw.write('region_%d\t%d-%d\n' % (
                        region_idx, last_end + 1, left - 1))
                fw.write('gene_%d\t%d-%d\t%s\n' % (
                    gene_no, left, right, gene_segment.locus_tag))
                last_end = right
            total_len = len(self.gene_reader.dna_code)
            if last_end < total_len:
                region_idx += 1
                fw.write('region_%d\t%d-%d\n' % (
                    region_idx, last_end + 1, total_len))
def analysis_download_file(download_file_path, inter):
    """Describe the genes around an interval of a downloaded gene file.

    Args:
        download_file_path: path handed to ``GeneFileReader``.
        inter: two positions; their order decides the strand. When
            ``inter[0] > inter[1]`` the extracted sequence is reversed and
            passed through ``get_opposite_dna``.

    Returns:
        ``(False, None)`` when ``build_information()`` reports failure.
        Otherwise ``(True, info)`` where ``info`` holds:

        * ``'source'``: ``gene_info.source``;
        * ``'data'``: string forms of every segment containing either
          interval endpoint, plus the closest segment ending at or before the
          lower endpoint and the closest one starting at or after the upper
          endpoint (deduplicated, unordered);
        * ``'sequence'``: ``dna_code[left - 1:right]``, strand-adjusted.
    """
    left = min(inter)
    right = max(inter)
    gene_info = GeneFileReader(download_file_path)
    if not gene_info.build_information():
        return False, None

    near_small = None
    near_big = None
    res_set = set()
    for gene_segment in gene_info.gene_segments:
        seg_start, seg_end = gene_segment.cds[0], gene_segment.cds[1]
        # Closest segment that ends at or before the interval.
        if seg_end <= left and (
                near_small is None or near_small.cds[1] < seg_end):
            near_small = gene_segment
        # Closest segment that starts at or after the interval.
        if seg_start >= right and (
                near_big is None or near_big.cds[0] > seg_start):
            near_big = gene_segment
        if seg_start <= left <= seg_end or seg_start <= right <= seg_end:
            res_set.add(str(gene_segment))

    # Store neighbours as strings too: the overlapping segments above are
    # stored as strings, and mixing raw objects into the same set produced a
    # mixed-type list with duplicates for segments matching both criteria.
    if near_small is not None:
        res_set.add(str(near_small))
    if near_big is not None:
        res_set.add(str(near_big))

    sequence = gene_info.dna_code[left - 1:right]
    if inter[0] > inter[1]:
        sequence = get_opposite_dna(sequence[::-1])
    return True, {
        'source': gene_info.source,
        'data': list(res_set),
        'sequence': sequence,
    }
def test_gene_data_reader(self):
    """Parse the NC_000913.3 download and dump every named gene to a file.

    Asserts that at least one gene segment was read, then writes the ``gene``
    attribute of each segment that has one to ``gene_all.txt`` in the data
    directory, one name per line.
    """
    source_path = os.path.join(self.download_directory, 'NC_000913.3.txt')
    reader = GeneFileReader(source_path)
    reader.build_information()
    self.assertTrue(len(reader.gene_segments) > 0)

    dump_path = os.path.join(self.data_directory, 'gene_all.txt')
    with open(dump_path, 'w', encoding='utf8') as fw:
        for segment in reader.gene_segments:
            if segment.gene is None:
                continue
            fw.write(segment.gene + '\n')
def __post_init__(self):
    """Resolve mode-specific paths and create the helper objects.

    ``input_path`` is exposed as ``rna_path`` in ``'rna'`` mode and as
    ``inter_path`` in ``'inter'`` mode; the other attribute is ``None``.
    The result file name ends in ``stream_<limit>_result.txt`` for ``'rna'``
    mode and ``gene_result.txt`` otherwise.
    """
    is_rna = self.mode == 'rna'
    is_inter = self.mode == 'inter'
    self.inter_path = self.input_path if is_inter else None
    self.rna_path = self.input_path if is_rna else None

    prefix = StrConverter.extract_file_name(
        os.path.basename(self.input_path))
    if is_rna:
        suffix = 'stream_%d' % self.limit
    else:
        suffix = 'gene'
    result_name = '%s_%s_result.txt' % (prefix, suffix)
    self.result_path = os.path.join(self.output_directory, result_name)

    self.gene_reader = GeneFileReader(self.data_path)
    self.logger = LoggerFactory()
    # Column name -> index, and index -> column name, filled in later.
    self.headers = {}
    self.inv_headers = []
def __post_init__(self):
    """Derive the output path, create helpers and validate the weights.

    The result file is ``<prefix>_match_result.txt`` in ``output_directory``,
    where the prefix comes from the gene file's base name. ``weighted`` must
    hold exactly five entries with a positive sum, otherwise the final
    ``assert`` fails.
    """
    self.data_name = os.path.basename(self.data_path)
    prefix = StrConverter.extract_file_name(os.path.basename(self.gene_path))
    self.result_path = os.path.join(
        self.output_directory, '%s_match_result.txt' % prefix)
    self.gene_reader = GeneFileReader(self.data_path)

    # Both strands are loaded later; progress counters start at zero.
    self.dna_code = self.rev_dna_code = None
    self.logger = LoggerFactory()
    self.lock = threading.Lock()
    self.solved = self.total = 0

    self.weighted_sum = sum(self.weighted)
    assert self.weighted_sum > 0 and len(self.weighted) == 5
class GeneSimilarityMatch:
    """Find the genome windows most similar to each query gene.

    Every query sequence from ``gene_path`` is slid across the DNA read from
    ``data_path`` and across its reverse complement (as produced by
    ``get_opposite_dna`` on the reversed string). Each window is scored by
    ``count_similarity`` and the best ``top_k`` candidates are written to
    ``<prefix>_match_result.txt`` together with rendered alignments.

    ``weighted`` must contain five weights with a positive sum. Index ``i``
    is paired with the i-th name in the similarity-name list used by
    ``find_candidate_for_gene`` (text_distance, direct_match, consistency,
    pattern, blat) and with the i-th item of
    ``MatchAlgorithm.get_all_items()``; an algorithm is reported only when
    its weight is positive.

    NOTE(review): the ``field(...)`` default and ``__post_init__`` suggest a
    ``@dataclass`` decorator outside this block -- confirm.
    """

    gene_path: str
    data_path: str
    output_directory: str
    top_k: int = 20
    candidate_distance: int = 5
    batch_size: int = 5
    patience: int = 0
    weighted: List[int] = field(default_factory=list)
    conditions: dict = None
    continuous_mismatch_limit: int = None
    order_type: OrderType = OrderType.Decrement
    dna_code = None
    rev_dna_code = None
    gene_name_filter = None

    def __post_init__(self):
        """Derive the output path, create helpers and validate the weights."""
        self.data_name = os.path.basename(self.data_path)
        file_name = os.path.basename(self.gene_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.result_path = os.path.join(
            self.output_directory, '%s_match_result.txt' % (file_prefix))
        self.gene_reader = GeneFileReader(self.data_path)
        self.dna_code = None
        self.rev_dna_code = None
        self.logger = LoggerFactory()
        self.lock = threading.Lock()
        self.solved = 0
        self.total = 0
        self.weighted_sum = sum(self.weighted)
        assert self.weighted_sum > 0 and len(self.weighted) == 5

    def _progress_percent(self):
        """Return solved/total as a percentage, or 0.0 when total is zero."""
        if not self.total:
            return 0.0
        return self.solved * 100.0 / self.total

    def run(self, gene_name_filter: GeneLocationAnalysis = None):
        """Match every query gene and write all results.

        The first line of ``gene_path`` is skipped as a header; each
        remaining non-blank line is ``name<TAB>sequence``. One worker thread
        is created per query and at most ``batch_size`` run at a time.

        Args:
            gene_name_filter: stored on the instance and forwarded to
                ``count_similarity``.
        """
        self.gene_name_filter = gene_name_filter
        self.gene_reader.build_information()
        self.dna_code = self.gene_reader.dna_code
        self.rev_dna_code = get_opposite_dna(self.gene_reader.dna_code[::-1])
        with open(self.result_path, 'w', encoding='utf8') as fw:
            with open(self.gene_path, 'r', encoding='utf8') as fr:
                # Skip the header row and ignore blank lines, which would
                # otherwise fail when the sequence column is read.
                gene_sequences = [
                    line for line in fr.readlines()[1:] if line.strip()]
            self.solved = 0
            # Each query is scanned over both strands.
            self.total = (
                len(self.gene_reader.dna_code) * len(gene_sequences) * 2)
            self.logger.info_with_expire_time(
                'Doing Similarity Matching: %d/%d(%.2f%%)' % (
                    self.solved, self.total, self._progress_percent()),
                self.solved, self.total)

            pending_tasks = deque()
            running_tasks = []
            for gene_sequence in gene_sequences:
                items = gene_sequence.strip().split('\t')
                name, gene = items[0], items[1].lower()
                t = threading.Thread(
                    target=self.find_candidate_for_gene,
                    args=(name, gene, fw,))
                pending_tasks.append(t)

            while pending_tasks:
                # Thread.isAlive() was removed in Python 3.9.
                running_tasks = [t for t in running_tasks if t.is_alive()]
                while len(running_tasks) < self.batch_size and pending_tasks:
                    t = pending_tasks.popleft()
                    t.start()
                    running_tasks.append(t)
                # Poll for finished workers before topping the batch up.
                time.sleep(10)
            for t in running_tasks:
                t.join()

    def find_candidate_for_gene(self, name, gene, fw):
        """Score one query on both strands and write its best candidates.

        Both strands are searched in parallel threads. Candidates are sorted
        by descending stored similarity; for ``OrderType.Increment`` the
        stored values are negated scores, so they are flipped back before
        being reported. Output is written while holding ``self.lock`` so
        records from different queries do not interleave.
        """
        t1 = HasReturnThread(
            func=self.match_gene, args=(name, gene, self.dna_code, False,))
        t1.start()
        t2 = HasReturnThread(
            func=self.match_gene, args=(name, gene, self.rev_dna_code, True,))
        t2.start()
        t1.join()
        t2.join()

        candidates = list(t1.get_result() + t2.get_result())
        candidates.sort(key=lambda arg: -arg.weighted_similarity)
        if self.order_type == OrderType.Increment:
            for candidate in candidates:
                candidate.weighted_similarity = -candidate.weighted_similarity
        results = self.render_similarity_for_candidates(
            gene, candidates[:self.top_k])

        similarity_names = [
            'text_distance_similarity',
            'direct_match_similarity',
            'consistency_similarity',
            'pattern_similarity',
            'blat_similarity',
        ]
        headers = ['name', 'direction', 'weighted_similarity']
        for weight_idx, similarity_name in enumerate(similarity_names):
            if self.weighted[weight_idx] > 0:
                headers.append(similarity_name)
        headers.append('original :')
        sequence_headers = [
            'gene_format :', 'target_format :', 'match_format :']

        with self.lock:
            # ``rank`` is the 1-based record number. It used to share the
            # name ``idx`` with the inner loops, which overwrote it and made
            # the printed numbers wrong.
            for rank, candidate_result in enumerate(results, start=1):
                candidate = candidate_result[0]
                fw.write('(%d)\n' % rank)
                attribute = {
                    'name': name,
                    'direction': '-' if candidate.is_reverse else '+',
                    'weighted_similarity':
                        '%.2f' % candidate.weighted_similarity,
                    'original :': gene,
                }
                for weight_idx, similarity_name in enumerate(
                        similarity_names):
                    if self.weighted[weight_idx] > 0:
                        algorithm = MatchAlgorithm.get_match_algorithm_by_name(
                            similarity_name)
                        attribute[similarity_name] = (
                            '%.2f' % candidate.similarity_dict[algorithm])

                sequence_content = []
                # candidate_result is [candidate, then three rendered
                # sequences per enabled algorithm].
                offset = 1
                for weight_idx, match_algorithm in enumerate(
                        MatchAlgorithm.get_all_items()):
                    if self.weighted[weight_idx] > 0:
                        rendered = candidate_result[offset:offset + 3]
                        for sequence_header, value in zip(
                                sequence_headers, rendered):
                            value = ''.join(value)
                            sequence_content.append(
                                match_algorithm.name + "_"
                                + sequence_header + '=' + value)
                        offset += 3

                fw.write('>%s/%s-%s\t%s,%s\n' % (
                    self.data_name.replace(".txt", ''),
                    candidate.start,
                    candidate.end,
                    ','.join(['%s=%s' % (key, attribute[key])
                              for key in headers if key in attribute]),
                    ','.join(sequence_content)
                ))
                fw.write('\n')

    def match_gene(self, name, gene, database, is_reverse):
        """Score every window of ``database`` against ``gene``.

        Args:
            name: query name, used only in progress messages.
            gene: query sequence.
            database: strand to scan.
            is_reverse: whether ``database`` is the reverse strand.

        Returns:
            The candidate list maintained through ``update_candidate_list``.
            For ``OrderType.Increment`` the stored similarities are negated.
        """
        candidates: List[MatchCandidate] = []
        gene_length = len(gene)
        database_length = len(database)
        limitation = database_length - gene_length + 1
        new_solved = 0
        similarity_heap = []
        buff = deque()
        match_pattern = (
            MatchPattern(gene, self.conditions) if self.conditions else None)

        for start in range(limitation):
            weighted_similarity, similarity_dict = count_similarity(
                weighted=self.weighted,
                gene=gene,
                database=database,
                offset=start,
                is_reverse=is_reverse,
                max_patience=self.patience,
                match_pattern=match_pattern,
                continuous_mismatch_limit=self.continuous_mismatch_limit,
                gene_name_filter=self.gene_name_filter)
            if self.order_type == OrderType.Increment:
                # Negate so that "larger is better" holds for both orders.
                weighted_similarity = -weighted_similarity
            new_candidate = MatchCandidate(
                left=start,
                right=start + gene_length - 1,
                is_reverse=is_reverse,
                database_length=database_length,
                weighted_similarity=weighted_similarity,
                similarity_dict=similarity_dict)
            added_flag = update_candidate_list(
                new_candidate, buff, candidates, self.candidate_distance)
            if added_flag:
                # The heap is only used for the progress message below.
                heapq.heappush(similarity_heap, candidates[-1])
                if len(similarity_heap) > self.top_k:
                    heapq.heappop(similarity_heap)

            new_solved += 1
            # Report progress on roughly one window in a thousand.
            if random.random() * 1000 < 1:
                with self.lock:
                    self.solved += new_solved
                    self.logger.info_with_expire_time(
                        'Doing Similarity Matching for %s[%s]: %d/%d(%.2f%%) '
                        '--top_k=%d '
                        '--top_similarity_info=[%s] '
                        '--gene_length=%d '
                        '--candidates_num=%d' % (
                            name,
                            '-' if is_reverse else '+',
                            self.solved,
                            self.total,
                            self._progress_percent(),
                            self.top_k,
                            similarity_heap[0].get_similarity_str()
                            if len(similarity_heap) > 0 else 'None',
                            gene_length,
                            len(candidates)
                        ),
                        self.solved, self.total)
                new_solved = 0

            # NOTE(review): the collapsed source does not show whether this
            # trim sits inside the progress branch or at loop level; loop
            # level is assumed here -- confirm against the original file.
            if len(candidates) > CandidateClearSize:
                candidates.sort(key=lambda arg: -arg.weighted_similarity)
                candidates = candidates[:self.top_k]

        # Flush whatever is still waiting in the buffer.
        while len(buff) > 0:
            update_candidate_list(None, buff, candidates, 1)
        with self.lock:
            # The last gene_length - 1 start positions have no full window
            # but are part of ``total``.
            self.solved += new_solved + gene_length - 1
        return candidates

    def render_similarity_for_candidates(self, gene, candidates):
        """Attach rendered alignments to each candidate.

        Returns:
            One list per candidate: the candidate itself followed by three
            sequences (gene, target, match marks) for every algorithm whose
            weight is positive.
        """
        result = []
        for candidate in candidates:
            database = (
                self.rev_dna_code if candidate.is_reverse else self.dna_code)
            candidate_result = [candidate]
            for weight_idx, match_algorithm in enumerate(
                    MatchAlgorithm.get_all_items()):
                if self.weighted[weight_idx] > 0:
                    candidate_result.extend(
                        self.render_target_dna_sequence(
                            match_algorithm, gene, database,
                            candidate.original_match_left))
            result.append(candidate_result)
        return result

    def render_target_dna_sequence(self, match_algorithm: MatchAlgorithm,
                                   gene, database, offset):
        """Render gene, target and match-mark sequences for one algorithm.

        In the mark sequence ``'*'`` is a position where ``should_change``
        reports no change, ``'.'`` is a mismatch or gap, and ``'-'`` (for the
        consistency algorithm) marks a gap inside the merged index range.

        Returns:
            ``(sequence_gene, sequence_target, sequence)`` as lists of
            characters. All three are empty for an unrecognised algorithm.
        """
        sequence_gene = []
        sequence_target = []
        sequence = []
        tot = len(gene)

        if match_algorithm == MatchAlgorithm.text_distance:
            _, dp = compute_text_distance_similarity(gene, database, offset)
            # Walk the table back from the bottom-right corner.
            # NOTE(review): assumes dp[i][j] compares gene[:i] with the j
            # characters of database starting at offset -- confirm.
            i, j = tot, tot
            while i > 0 or j > 0:
                gene_a = gene[i - 1] if i > 0 else '.'
                gene_b = database[j + offset - 1] if j > 0 else '.'
                if i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + \
                        should_change(gene[i - 1], database[j + offset - 1]):
                    sequence_gene.append(gene_a)
                    sequence_target.append(gene_b)
                    same = should_change(
                        gene[i - 1], database[j + offset - 1]) == 0
                    sequence.append('*' if same else '.')
                    i, j = i - 1, j - 1
                # The i > 0 / j > 0 guards keep the lookups from wrapping
                # around to dp[-1] or dp[i][-1] once one index reaches zero.
                elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
                    sequence_gene.append(gene_a)
                    sequence_target.append('.')
                    sequence.append('.')
                    i -= 1
                elif j > 0 and dp[i][j] == dp[i][j - 1] + 1:
                    sequence_gene.append('.')
                    sequence_target.append(gene_b)
                    sequence.append('.')
                    j -= 1
                else:
                    raise ValueError('Should not go here!')
            sequence_gene.reverse()
            sequence_target.reverse()
            sequence.reverse()

        elif match_algorithm in (MatchAlgorithm.direct_match,
                                 MatchAlgorithm.pattern):
            # Both algorithms are rendered as a position-by-position compare.
            for i in range(tot):
                sequence_gene.append(gene[i])
                sequence_target.append(database[i + offset])
                if not should_change(gene[i], database[i + offset]):
                    sequence.append('*')
                else:
                    sequence.append('.')

        elif match_algorithm == MatchAlgorithm.consistency:
            _, score_queue, score_merge_idx = compute_consistency_similarity(
                gene, database, offset, self.patience)
            sequence_gene.extend(gene[:])
            sequence_target.extend(database[offset:offset + tot])
            cur_pos = 0
            for run_idx, (same_cnt, same_end) in enumerate(score_queue):
                same_start = same_end - same_cnt
                # Gap before this matching run.
                in_merged = (
                    score_merge_idx[0] < run_idx <= score_merge_idx[1])
                while cur_pos < same_start:
                    sequence.append('-' if in_merged else '.')
                    cur_pos += 1
                while cur_pos < same_end:
                    sequence.append('*')
                    cur_pos += 1
            while cur_pos < tot:
                sequence.append('.')
                cur_pos += 1

        elif match_algorithm == MatchAlgorithm.blat:
            _, pos_data_end = compute_blat_similarity(gene, database, offset)
            # NOTE(review): the constants 4, 7 and 3 address gene positions
            # 0-3 and 7-4, i.e. this looks written for 8-character genes --
            # confirm.
            pos_data = offset
            pos_gene = 0
            # Align gene[0:4] forwards from the window start.
            while pos_gene < 4:
                if should_change(gene[pos_gene], database[pos_data]) > 0:
                    sequence_gene.append('-')
                    sequence_target.append(database[pos_data])
                    sequence.append('.')
                    pos_data += 1
                else:
                    sequence_gene.append(gene[pos_gene])
                    sequence_target.append(database[pos_data])
                    sequence.append('*')
                    pos_gene += 1
                    pos_data += 1

            # Align gene[7] down to gene[4] backwards from the match end.
            rev_pos_gene = 7
            rev_pos_data = pos_data_end - 1
            rev_sequence_gene = []
            rev_sequence_target = []
            rev_sequence = []
            while rev_pos_gene > 3:
                if should_change(
                        gene[rev_pos_gene], database[rev_pos_data]) > 0:
                    rev_sequence_gene.append('-')
                    rev_sequence_target.append(database[rev_pos_data])
                    rev_sequence.append('.')
                    rev_pos_data -= 1
                else:
                    rev_sequence_gene.append(gene[rev_pos_gene])
                    rev_sequence_target.append(database[rev_pos_data])
                    rev_sequence.append('*')
                    rev_pos_gene -= 1
                    rev_pos_data -= 1

            # Whatever lies between the two halves is an insertion.
            while pos_data <= rev_pos_data:
                sequence_gene.append('-')
                sequence_target.append(database[pos_data])
                sequence.append('.')
                pos_data += 1
            sequence_gene.extend(rev_sequence_gene[::-1])
            sequence_target.extend(rev_sequence_target[::-1])
            sequence.extend(rev_sequence[::-1])

        return sequence_gene, sequence_target, sequence
class GeneExtract:
    """Extract DNA sequences for a list of genes or a table of ranges.

    ``gene_extract_based`` selects the mode:

    * ``'gene'``: ``rna_path`` holds one gene name per line; every segment
      registered under that name is written with its sequence.
    * ``'range'``: ``rna_path`` is a tab-separated table with a header row;
      rows whose ``sequence`` column is empty get it filled in from the two
      positions found at ``left_idx`` and ``right_idx``.

    Results go to ``<prefix>_extract_result.txt`` in ``output_directory``.

    NOTE(review): the use of ``__post_init__`` suggests the class is decorated
    with ``@dataclass`` outside this block -- confirm.
    """

    data_path: str
    rna_path: str
    output_directory: str
    gene_extract_based: str = 'gene'
    left_idx: int = -2
    right_idx: int = -1

    def __post_init__(self):
        """Derive the result path and create the reader and header maps."""
        file_name = os.path.basename(self.rna_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        self.result_path = os.path.join(
            self.output_directory, '%s_extract_result.txt' % file_prefix)
        self.gene_reader = GeneFileReader(self.data_path)
        self.headers = {}
        self.inv_headers = []

    def generate_header(self, items):
        """Rebuild the column maps from a tab-separated header line.

        ``headers`` maps column name -> index and ``inv_headers`` lists the
        column names in order. Both are rebuilt from scratch so a second call
        does not append duplicate columns.
        """
        self.headers = {}
        self.inv_headers = []
        for idx, col_name in enumerate(items.strip().split('\t')):
            self.headers[col_name] = idx
            self.inv_headers.append(col_name)

    def run(self):
        """Load the gene data and write the result file for the chosen mode.

        Any ``gene_extract_based`` value other than ``'gene'`` or ``'range'``
        leaves the result file empty.
        """
        self.gene_reader.build_information()
        dna_code = self.gene_reader.dna_code
        with open(self.result_path, 'w', encoding='utf8') as fw:
            if self.gene_extract_based == 'gene':
                self.extract_sequence_based_on_gene(dna_code, fw)
            elif self.gene_extract_based == 'range':
                self.extract_sequence_based_on_range(dna_code, fw)

    def extract_sequence_based_on_gene(self, dna_code, fw):
        """Write one row per segment of every gene named in ``rna_path``.

        Each row holds ``d<line number>``, the gene name, both ``cds`` bounds,
        the product and ``dna_code[start - 1:end]``. Names without any
        segment are reported on stdout.
        """
        # The header used to contain a doubled tab between "from" and "end",
        # giving it one more column than the data rows.
        fw.write('No\tgene\tfrom\tend\tproduct\tsequence\n')
        with open(self.rna_path, 'r', encoding='utf8') as fr:
            for gene_idx, gene in enumerate(fr):
                gene = gene.strip()
                segment_ids = self.gene_reader.gene_name_segment_map.get(
                    gene, [])
                for idx in segment_ids:
                    gene_segment = self.gene_reader.gene_segments[idx]
                    start = gene_segment.cds[0]
                    end = gene_segment.cds[1]
                    product = gene_segment.product
                    sequence = dna_code[start - 1:end]
                    fw.write('d%d\t%s\t%s\t%s\t%s\t%s\n' % (
                        gene_idx + 1, gene, start, end, product, sequence))
                if not segment_ids:
                    print('%s not found in %s' % (gene, self.data_path))

    def extract_sequence_based_on_range(self, dna_code, fw):
        """Copy the range table, filling in missing ``sequence`` values.

        For a row with positions ``a`` and ``b`` the slice is
        ``dna_code[a - 1:b - 1]`` when ``a < b``; otherwise it is
        ``dna_code[b:a]``, reversed and passed through ``get_opposite_dna``.
        Rows that cannot be processed are printed with a traceback and still
        written with whatever columns they had. An empty input file produces
        no output.
        """
        with open(self.rna_path, 'r', encoding='utf8') as fr:
            lines = [line.strip() for line in fr]
        if not lines:
            return
        self.generate_header(lines[0])
        fw.write(lines[0] + '\n')
        for line in lines[1:]:
            infos = line.strip().split('\t')
            # zip() ignores cells beyond the header instead of failing.
            result = dict(zip(self.inv_headers, infos))
            if result.get('sequence', '') == '':
                try:
                    a = int(infos[self.left_idx])
                    b = int(infos[self.right_idx])
                    left = min(a, b)
                    right = max(a, b)
                    direction = a < b
                    # Positions are 1-based; convert to 0-based slice bounds.
                    left -= 1
                    right -= 1
                    if not direction:
                        left += 1
                        right += 1
                    dna = dna_code[left:right]
                    if not direction:
                        result['sequence'] = get_opposite_dna(dna[::-1])
                    else:
                        result['sequence'] = dna
                except Exception:
                    # Best effort: report the bad row and keep going.
                    print(infos)
                    traceback.print_exc()
            fw.write(self.extract_output(result) + '\n')

    def extract_output(self, result):
        """Join ``result`` values in header order; missing columns are ''."""
        return '\t'.join(result.get(name, '') for name in self.inv_headers)
class GeneStreamAnalysis:
    """Report upstream/downstream context for genes or intervals.

    ``mode`` selects what ``input_path`` contains:

    * ``'rna'``: a tab-separated table with ``gene``, ``map_start_pos`` and
      ``map_end_pos`` columns. For every row the gene sequence and up to
      ``limit`` bases on each side are written.
    * ``'inter'``: lines of ``left,right``. For every interval the nearest
      gene entirely before it and the nearest gene entirely after it are
      written.

    NOTE(review): the use of ``__post_init__`` suggests the class is decorated
    with ``@dataclass`` outside this block -- confirm.
    """

    data_path: str
    input_path: str
    output_directory: str
    mode: str = 'rna'
    limit: int = 200

    def __post_init__(self):
        """Resolve mode-specific paths and create the helper objects."""
        self.inter_path = self.input_path if self.mode == 'inter' else None
        self.rna_path = self.input_path if self.mode == 'rna' else None
        file_name = os.path.basename(self.input_path)
        file_prefix = StrConverter.extract_file_name(file_name)
        suffix = 'stream_%d' % self.limit if self.mode == 'rna' else 'gene'
        self.result_path = os.path.join(
            self.output_directory,
            '%s_%s_result.txt' % (file_prefix, suffix))
        self.gene_reader = GeneFileReader(self.data_path)
        self.logger = LoggerFactory()
        self.headers = {}
        self.inv_headers = []

    def get_utr_between(self, first, second):
        """Return the bases strictly between two segments, given by index."""
        left = self.gene_reader.gene_segments[first].cds[1]
        right = self.gene_reader.gene_segments[second].cds[0] - 1
        return self.gene_reader.dna_code[left:right]

    def work_for_gene_index(self, index, start, end):
        """Return ``(sequence, upstream, downstream)`` for one segment.

        ``start > end`` denotes the reverse direction: the sequence is
        reversed and passed through ``get_opposite_dna``, and the two flanks
        are converted the same way and swapped.

        Raises:
            AssertionError: if the segment's ``cds`` bounds differ from
                ``min(start, end)`` / ``max(start, end)``.
        """
        gene_segment = self.gene_reader.gene_segments[index]
        assert gene_segment.cds[0] == min(start, end)
        assert gene_segment.cds[1] == max(start, end)
        dna_code = self.gene_reader.dna_code
        seg_start, seg_end = gene_segment.cds[0], gene_segment.cds[1]
        seq = dna_code[seg_start - 1:seg_end]
        # Clamp at 0 so a gene near the start does not wrap around.
        upstream = dna_code[max(seg_start - self.limit - 1, 0):seg_start - 1]
        downstream = dna_code[seg_end:seg_end + self.limit]
        if start > end:
            seq = get_opposite_dna(seq[::-1])
            upstream, downstream = (
                get_opposite_dna(downstream[::-1]),
                get_opposite_dna(upstream[::-1]),
            )
        return seq, upstream, downstream

    def work_for_gene(self, gene_idx, gene_name, start, end, fw):
        """Write the report block for every segment of one gene.

        Anything from the first ``'->'`` in ``gene_name`` onwards is dropped
        before the lookup. Unknown genes are logged and skipped.
        """
        gene_name = gene_name.split('->', 1)[0]
        if gene_name not in self.gene_reader.gene_name_segment_map:
            self.logger.info("%s not found in data" % gene_name)
            return
        fw.write('%d. %s\n' % (gene_idx, gene_name))
        segment_ids = self.gene_reader.gene_name_segment_map[gene_name]
        for cnt, idx in enumerate(segment_ids, start=1):
            seq, up, down = self.work_for_gene_index(idx, start, end)
            gene_segment = self.gene_reader.gene_segments[idx]
            fw.write('%d)\n' % cnt)
            fw.write('position\t%d %s %d\n' % (
                gene_segment.cds[0],
                '->' if start < end else '<-',
                gene_segment.cds[1]))
            fw.write('product\t%s\n' % gene_segment.product)
            fw.write('GeneID\t%s\n' % gene_segment.gene_id)
            fw.write('stream\t%s\n' % seq)
            if up:
                fw.write('upstream\t%s\n' % up)
            if down:
                fw.write('downstream\t%s\n' % down)
            fw.write('\n')

    def check_inter(self, fw):
        """Write the nearest flanking genes for every interval line.

        Blank lines in ``inter_path`` are skipped. ``up`` is the segment with
        the largest end that lies entirely before ``left``; ``down`` is the
        segment with the smallest start that lies entirely after ``right``.
        """
        with open(self.inter_path, 'r', encoding='utf8') as fr:
            for line in fr:
                line = line.strip()
                if line == '':
                    continue
                left, right = map(int, line.split(','))
                up, down = None, None
                for gene_segment in self.gene_reader.gene_segments:
                    seg_low = min(gene_segment.cds)
                    seg_high = max(gene_segment.cds)
                    if seg_high < left and (
                            up is None or max(up.cds) < seg_high):
                        up = gene_segment
                    if seg_low > right and (
                            down is None or min(down.cds) > seg_low):
                        down = gene_segment
                fw.write('%s:\n' % line)
                if up is not None:
                    fw.write(
                        'up-gene\t%s\nup-position\t%s\nup-product\t%s\n' % (
                            up.gene, '-'.join(map(str, up.cds)), up.product))
                if down is not None:
                    fw.write(
                        'down-gene\t%s\ndown-position\t%s\n'
                        'down-product\t%s\n' % (
                            down.gene, '-'.join(map(str, down.cds)),
                            down.product))
                fw.write('\n')

    def generate_header(self, items):
        """Rebuild the column maps from a tab-separated header line.

        Both maps are rebuilt from scratch so repeated runs do not
        accumulate duplicate columns.
        """
        self.headers = {}
        self.inv_headers = []
        for idx, col_name in enumerate(items.strip().split('\t')):
            self.headers[col_name] = idx
            self.inv_headers.append(col_name)

    def run(self):
        """Load the gene data and write the report for the configured mode.

        In ``'rna'`` mode the number written before each gene is the row's
        0-based position after the header; blank rows are skipped but still
        counted, and an empty input file produces an empty report.

        Raises:
            ValueError: if ``mode`` is neither ``'rna'`` nor ``'inter'``.
        """
        self.gene_reader.build_information()
        with open(self.result_path, 'w', encoding='utf8') as fw:
            if self.mode == 'rna':
                with open(self.rna_path, 'r', encoding='utf8') as fr:
                    lines = fr.readlines()
                if not lines:
                    return
                self.generate_header(lines[0])
                name_col = self.headers['gene']
                start_col = self.headers['map_start_pos']
                end_col = self.headers['map_end_pos']
                for gene_idx, line in enumerate(lines[1:]):
                    # A blank (e.g. trailing) line has no columns to read.
                    if not line.strip():
                        continue
                    items = line.split('\t')
                    gene_name = items[name_col]
                    start = int(items[start_col])
                    end = int(items[end_col])
                    self.work_for_gene(
                        gene_idx, gene_name.strip(), start, end, fw)
            elif self.mode == 'inter':
                self.check_inter(fw)
            else:
                raise ValueError(self.mode)