def load_unique_vntrs_data():
    """Load every VNTR stored in the trained-models database.

    Reads all rows of the ``vntrs`` table in the SQLite file named by
    ``settings.TRAINED_MODELS_DB`` and builds one ``ReferenceVNTR`` per row.

    Returns:
        list: the ``ReferenceVNTR`` objects, in the order the query yields
        the rows.
    """
    vntrs = []
    db_file = settings.TRAINED_MODELS_DB
    db = sqlite3.connect(db_file)
    # try/finally guarantees the connection is released even when a row
    # fails to parse; the connection used to be left open on every call.
    try:
        cursor = db.cursor()
        cursor.execute(
            '''SELECT id, nonoverlapping, chromosome, ref_start, gene_name,
                      annotation, pattern, left_flanking, right_flanking,
                      repeats, scaled_score
               FROM vntrs''')
        for row in cursor:
            # Numbers are kept as they are; every other value (including
            # NULL/None) is converted with str().
            values = [
                element if isinstance(element, (int, float)) else str(element)
                for element in row
            ]
            (vntr_id, overlap, chrom, start, gene, annotation, pattern,
             left_flank, right_flank, segments, score) = values

            # The ``repeats`` column holds the repeat segments joined by commas.
            repeat_segments = segments.split(',')
            vntr = ReferenceVNTR(int(vntr_id), pattern, int(start), chrom,
                                 gene, annotation, len(repeat_segments),
                                 scaled_score=score)
            vntr.init_from_xml(repeat_segments, left_flank, right_flank)
            # Only the literal text 'True' marks a VNTR as non-overlapping.
            vntr.non_overlapping = overlap == 'True'
            vntrs.append(vntr)
    finally:
        db.close()
    return vntrs
def load_unprocessed_vntrseek_data(vntrseek_output, chromosome=None):
    """Build ``ReferenceVNTR`` objects from a VNTRseek output file.

    Every non-blank line must split on whitespace into exactly five fields:
    repeat count, an ignored field, pattern, chromosome and start position.
    A record is kept only when its pattern is at most 100 characters long,
    its chromosome equals ``chromosome`` (when one is given) and
    ``is_vntr_close_to_gene`` accepts its estimated span.

    Args:
        vntrseek_output: path of the file to read.
        chromosome: if not ``None``, only records on this chromosome are kept.

    Returns:
        list: the selected ``ReferenceVNTR`` objects. Each ID is the index of
        its line among the non-blank lines of the file.
    """
    genes_info = get_genes_info()
    with open(vntrseek_output) as input_file:
        stripped_lines = [raw.strip() for raw in input_file.readlines()]
    records = [text for text in stripped_lines if text != '']

    selected = []
    for vntr_id, record in enumerate(records):
        repeat_field, _, pattern, chromosome_number, start_field = record.split()
        pattern_length = len(pattern)
        if pattern_length > 100:
            continue

        # The start is shifted down by one (presumably 1-based -> 0-based;
        # confirm against the VNTRseek format) and the repeat count is padded
        # by 5 before truncation to an int.
        start = int(start_field) - 1
        estimated_repeats = int(float(repeat_field) + 5)
        estimated_end = estimated_repeats * pattern_length + start

        on_requested_chromosome = (chromosome is None
                                   or chromosome_number == chromosome)
        if on_requested_chromosome and is_vntr_close_to_gene(
                genes_info, chromosome_number, start, estimated_end):
            selected.append(
                ReferenceVNTR(vntr_id, pattern, start, chromosome_number,
                              None, None, estimated_repeats))

    print('%s VNTRs are close to a gene' % len(selected))
    return selected
def add_model(args, addmodel_parser):
    """Train a model for a user-described VNTR and save it to the database.

    Checks the required command-line options, reads the chromosome sequence
    from the reference FASTA, builds a ``ReferenceVNTR``, trains its
    recruitment score with ``VNTRFinder`` and stores the result through
    ``save_reference_vntr_to_database``.

    Args:
        args: parsed arguments; ``reference``, ``chromosome``, ``pattern``,
            ``start``, ``end`` and ``models`` are read.
        addmodel_parser: parser handed to ``print_error`` when a required
            option is missing.
    """
    # Same checks, order and messages as before, expressed as one loop.
    # NOTE(review): the truthiness test also rejects a value of 0 for
    # --start/--end; confirm whether that is intended.
    for option in ('reference', 'chromosome', 'pattern', 'start', 'end'):
        if not getattr(args, option):
            print_error(addmodel_parser, '--%s is required' % option)

    chromosome = args.chromosome
    chr_sequence = ''
    # The reference file is now closed once parsing ends; the handle used
    # to be left open.
    with open(args.reference) as reference_file:
        for fasta in SeqIO.parse(reference_file, 'fasta'):
            # Convert only the matching record to a string. As before, the
            # last record with a matching name wins.
            if fasta.id == chromosome:
                chr_sequence = str(fasta.seq)
    # NOTE(review): if no record matches, chr_sequence stays '' and training
    # continues with an empty sequence; confirm this is acceptable.

    if args.models is not None:
        settings.TRAINED_MODELS_DB = args.models
    if not os.path.exists(settings.TRAINED_MODELS_DB):
        create_vntrs_database(settings.TRAINED_MODELS_DB)

    vntr_id = get_largest_id_in_database() + 1
    # Length of the region in pattern units, padded by 5.
    estimated_repeats = int((args.end - args.start) / len(args.pattern) + 5)
    ref_vntr = ReferenceVNTR(vntr_id, args.pattern, args.start, chromosome,
                             None, None, estimated_repeats, chr_sequence)
    ref_vntr.init_from_vntrseek_data()

    vntr_finder = VNTRFinder(ref_vntr)
    print(
        'Searching reference genome for regions with shared kmers with VNTR. It takes a few hours for human genome'
    )
    scaled_recruitment_score = vntr_finder.train_classifier_threshold(
        args.reference)
    ref_vntr.scaled_score = scaled_recruitment_score
    save_reference_vntr_to_database(ref_vntr)
    print('Training completed. VNTR saved with ID: %s to the database' %
          vntr_id)
def get_reference_vntr(self, ru_count=10):
    """Return a ``ReferenceVNTR`` whose repeat segments are copies of one pattern.

    The object is built with ID 1, pattern ``'ACGTACGT'``, start 1000 and
    chromosome ``'chr1'``; its ``repeat_segments`` attribute is then set to
    ``ru_count`` copies of the pattern.

    Args:
        ru_count: number of pattern copies placed in ``repeat_segments``.
    """
    repeat_unit = 'ACGTACGT'
    reference = ReferenceVNTR(1, repeat_unit, 1000, 'chr1', None, None)
    segments = [repeat_unit] * ru_count
    reference.repeat_segments = segments
    return reference
def get_reference_vntr(self):
    """Return a ``ReferenceVNTR`` with ID 1, pattern ``'CACA'``, start 1000 on ``'chr1'``."""
    return ReferenceVNTR(1, 'CACA', 1000, 'chr1', None, None)