示例#1
0
def load_unique_vntrs_data():
    """Load every VNTR stored in the trained-models database.

    Reads all rows of the ``vntrs`` table from the SQLite file named by
    ``settings.TRAINED_MODELS_DB`` and builds one ``ReferenceVNTR`` per row.

    Returns:
        list: the ``ReferenceVNTR`` objects, in the order the query yields
        the rows.
    """
    query = '''SELECT id, nonoverlapping, chromosome, ref_start, gene_name, annotation, pattern, left_flanking,
    right_flanking, repeats, scaled_score FROM vntrs'''

    vntrs = []
    db = sqlite3.connect(settings.TRAINED_MODELS_DB)
    try:
        cursor = db.cursor()
        cursor.execute(query)

        for row in cursor:
            # Keep ints and floats as they are; everything else is passed
            # through str() (so a NULL column ends up as the string 'None').
            new_row = [
                element if isinstance(element, (int, float)) else str(element)
                for element in row
            ]
            (vntr_id, overlap, chrom, start, gene, annotation, pattern,
             left_flank, right_flank, segments, score) = new_row

            # The ``repeats`` column holds the repeat segments as one
            # comma-separated string; the repeat count is derived from it.
            repeat_segments = segments.split(',')
            vntr = ReferenceVNTR(int(vntr_id),
                                 pattern,
                                 int(start),
                                 chrom,
                                 gene,
                                 annotation,
                                 len(repeat_segments),
                                 scaled_score=score)
            vntr.init_from_xml(repeat_segments, left_flank, right_flank)
            # ``nonoverlapping`` is compared against the text 'True'.
            vntr.non_overlapping = overlap == 'True'
            vntrs.append(vntr)
    finally:
        # Release the database handle even if a row fails to parse.
        db.close()

    return vntrs
示例#2
0
def load_unprocessed_vntrseek_data(vntrseek_output, chromosome=None):
    """Build ReferenceVNTR candidates from a VNTRseek output file.

    Every non-blank line must split on whitespace into exactly five fields:
    repeat count, an ignored field, pattern, chromosome and start position.
    A record is kept only if its pattern is at most 100 characters long, it
    lies on ``chromosome`` (when one is given) and ``is_vntr_close_to_gene``
    accepts its estimated span.

    Args:
        vntrseek_output: path of the VNTRseek output file to read.
        chromosome: optional chromosome name; when not None, records on any
            other chromosome are skipped.

    Returns:
        list: the ``ReferenceVNTR`` objects that passed all filters. Each id
        is the index of its line among the non-blank lines of the file.
    """
    genes_info = get_genes_info()
    candidates = []
    with open(vntrseek_output) as handle:
        stripped_lines = (raw.strip() for raw in handle.readlines())
        records = [text for text in stripped_lines if text != '']
        for index, record in enumerate(records):
            copies, _, motif, chrom, position = record.split()
            if len(motif) > 100:
                continue
            # The start read from the file is decremented by one.
            position = int(position) - 1
            # Repeat count from the file plus a margin of 5, truncated.
            repeat_estimate = int(float(copies) + 5)
            if chromosome is not None and chrom != chromosome:
                continue
            end_estimate = position + repeat_estimate * len(motif)
            if is_vntr_close_to_gene(genes_info, chrom, position,
                                     end_estimate):
                candidates.append(
                    ReferenceVNTR(index, motif, position, chrom, None, None,
                                  repeat_estimate))
    print('%s VNTRs are close to a gene' % len(candidates))
    return candidates
示例#3
0
def add_model(args, addmodel_parser):
    """Train a model for a user-specified VNTR and save it to the database.

    Reads ``reference``, ``chromosome``, ``pattern``, ``start``, ``end`` and
    ``models`` from ``args``. The first five are required and are checked by
    truthiness, so a value such as ``0`` is also reported as missing.

    Args:
        args: parsed command-line namespace carrying the attributes above.
        addmodel_parser: parser handed to ``print_error`` when a required
            argument is missing (presumably print_error aborts - TODO confirm).
    """
    if not args.reference:
        print_error(addmodel_parser, '--reference is required')
    if not args.chromosome:
        print_error(addmodel_parser, '--chromosome is required')
    if not args.pattern:
        print_error(addmodel_parser, '--pattern is required')
    if not args.start:
        print_error(addmodel_parser, '--start is required')
    if not args.end:
        print_error(addmodel_parser, '--end is required')

    chromosome = args.chromosome
    chr_sequence = ''

    # Open the reference in a ``with`` block so the handle is closed once the
    # scan is over, and build the sequence string only for a matching record.
    with open(args.reference) as reference_file:
        for fasta in SeqIO.parse(reference_file, 'fasta'):
            # No early exit: the last record with a matching id wins.
            if fasta.id == chromosome:
                chr_sequence = str(fasta.seq)
    # NOTE(review): chr_sequence stays '' when the chromosome is not in the
    # reference; training still proceeds - confirm this is intended.

    # An explicit --models path overrides the configured database location.
    if args.models is not None:
        settings.TRAINED_MODELS_DB = args.models
    if not os.path.exists(settings.TRAINED_MODELS_DB):
        create_vntrs_database(settings.TRAINED_MODELS_DB)

    vntr_id = get_largest_id_in_database() + 1
    # Repeat count implied by the span and pattern length, plus a margin of 5.
    estimated_repeats = int((args.end - args.start) / len(args.pattern) + 5)
    ref_vntr = ReferenceVNTR(vntr_id, args.pattern, args.start, chromosome,
                             None, None, estimated_repeats, chr_sequence)
    ref_vntr.init_from_vntrseek_data()
    vntr_finder = VNTRFinder(ref_vntr)

    print(
        'Searching reference genome for regions with shared kmers with VNTR. It takes a few hours for human genome'
    )
    scaled_recruitment_score = vntr_finder.train_classifier_threshold(
        args.reference)
    ref_vntr.scaled_score = scaled_recruitment_score
    save_reference_vntr_to_database(ref_vntr)
    print('Training completed. VNTR saved with ID: %s to the database' %
          vntr_id)
 def get_reference_vntr(self, ru_count=10):
     """Return a ReferenceVNTR on 'chr1' at 1000 with ``ru_count`` repeats.

     The pattern is 'ACGTACGT' and ``repeat_segments`` is set to
     ``ru_count`` copies of it.
     """
     repeat_unit = 'ACGTACGT'
     reference = ReferenceVNTR(1, repeat_unit, 1000, 'chr1', None, None)
     reference.repeat_segments = [repeat_unit for _ in range(ru_count)]
     return reference
示例#5
0
 def get_reference_vntr(self):
     """Return a ReferenceVNTR with id 1 and pattern 'CACA' on 'chr1' at 1000."""
     return ReferenceVNTR(1, 'CACA', 1000, 'chr1', None, None)