示例#1
0
    def __init__(self, entries, reference_path=None, query_path=None):
        """Index alignment entries by reference contig and by query contig.

        Each entry is stored twice: once in the interval tree kept for its
        reference contig (keyed by ``entry.r_tag``) under
        ``Interval(s1_start, s1_end)``, and once in the tree kept for its
        query contig (keyed by ``entry.q_tag``) under
        ``Interval(s2_start, s2_end)``.

        Args:
            entries: iterable of objects exposing ``r_tag``, ``q_tag``,
                ``s1_start``, ``s1_end``, ``s2_start`` and ``s2_end``.
            reference_path: optional path of the reference FASTA. When given,
                ``reference_name`` (last ``/``-separated component) and
                ``reference`` (a ``pyfaidx.Faidx``) are set.
            query_path: optional path of the query FASTA. When given,
                ``query_name`` and ``query`` are set the same way.
        """
        self.r_contigs = {}
        self.q_contigs = {}

        # The FASTA attributes are only created when a path was supplied;
        # they stay undefined on the instance otherwise.
        if reference_path is not None:
            self.reference_name = reference_path.rsplit('/', 1)[-1]
            self.reference = pyfaidx.Faidx(reference_path)
        if query_path is not None:
            self.query_name = query_path.rsplit('/', 1)[-1]
            self.query = pyfaidx.Faidx(query_path)

        for aln in entries:
            # Reference side: create the contig's tree the first time the
            # contig is seen, then file the entry under its s1 span.
            r_tree = self.r_contigs.get(aln.r_tag)
            if not r_tree:
                r_tree = self.r_contigs[aln.r_tag] = ival.IntervalTree()
            r_tree.put(ival.Interval(aln.s1_start, aln.s1_end), aln)

            # Query side: same procedure, using the s2 span.
            q_tree = self.q_contigs.get(aln.q_tag)
            if not q_tree:
                q_tree = self.q_contigs[aln.q_tag] = ival.IntervalTree()
            q_tree.put(ival.Interval(aln.s2_start, aln.s2_end), aln)
def main():
    """Build a model from the reference FASTA and pickle it, gzip-compressed.

    Uses the module-level names ``refFile`` and ``modelName``. When no
    reference was supplied the manual page is shown and the program exits.
    Otherwise the reference is symlinked into the working directory under the
    model's base name, indexed with ``pyfaidx.Faidx``, the index is passed to
    ``parseIndexRef`` / ``create_model`` (defined elsewhere), and the
    resulting records are pickled to ``<modelName>_p.gz``.

    The temporary symlink and its ``.fai`` index are removed afterwards, also
    when one of the intermediate steps raises.
    """
    if refFile is None:
        man.manpage()
        sys.exit()

    basename = str(os.path.basename(modelName))
    os.symlink(refFile, basename)
    # pyfaidx writes the index next to the indexed file as "<file>.fai".
    # Address exactly that file instead of scanning the directory for any
    # "*.fai", which could pick up (and later delete) an unrelated index.
    indexFile = os.path.join('.', basename + '.fai')
    try:
        pyfaidx.Faidx(basename)

        ref_index = parseIndexRef(indexFile)
        model = create_model(ref_index)
        records = model.to_records(index=False)

        outf = modelName + '_p.gz'
        # The context manager guarantees the gzip stream is flushed/closed.
        with gzip.open(outf, 'wb') as output:
            pickle.dump(records, output)
    finally:
        os.remove(basename)
        # The index may not exist if indexing itself failed.
        if os.path.exists(indexFile):
            os.remove(indexFile)
示例#3
0
def twoSampleBuildContigs(assemblyCoordsFile, queryAlignment,
                          binCoordsDirectory, outputDirectory, minLength,
                          minMatch, minCov, simple):
    """Extend every bin FASTA in a directory with newly built contigs.

    Reads both coordinate files via ``coords.readCoordFile``, then for each
    ``.fna`` / ``.fa`` file found directly under ``binCoordsDirectory`` calls
    ``buildContigs`` and writes ``<outputDirectory>/new_<file name>``
    containing the bin's original sequences followed by the new contigs.

    The output directory is created if missing; an existing one is reused
    (a notice is printed). The function terminates the interpreter via
    ``sys.exit()`` once all bins are processed.
    """
    candidates = glob.glob(binCoordsDirectory + "/*")
    assembly_coords = coords.readCoordFile(assemblyCoordsFile)
    query_alignment = coords.readCoordFile(queryAlignment, readFastas=False)

    try:
        os.mkdir(outputDirectory)
    except FileExistsError:
        print("Overwriting existing files")

    for bin_path in candidates:
        # Only FASTA bins are processed; everything else is ignored.
        if not bin_path.endswith((".fna", ".fa")):
            continue

        print("Working on: ", bin_path)
        old_bin = pyfaidx.Faidx(bin_path)
        new_contigs = buildContigs(assembly_coords, query_alignment,
                                   old_bin, simple, outputDirectory,
                                   minLength, minMatch, minCov)

        out_path = outputDirectory + "/new_" + bin_path.split('/')[-1]
        with open(out_path, 'w') as fh:
            # First the untouched sequences of the original bin ...
            for name, record in old_bin.index.items():
                seq = old_bin.fetch(name, 1, record.rlen)
                fh.write(">" + seq.name + '\n' + seq.seq + '\n')
            # ... then the contigs produced by buildContigs.
            for name in new_contigs.keys():
                fh.write('>' + name + '\n' + new_contigs[name].seq + '\n')

    print("done!")
    sys.exit()
def main():
    """Build background/control/experiment models and pickle them to disk.

    Uses the module-level names ``modelName``, ``refdatabase``, ``table`` and
    ``start_time``. The reference is symlinked into the working directory
    under the model's base name and indexed with ``pyfaidx.Faidx``; the index
    and the count table (``createfctable``) are fed to ``create_model`` three
    times, and the three record arrays are pickled, in the order
    ``[background, control, experiment]``, to ``<modelName>.p``.

    The temporary symlink and its ``.fai`` index are removed afterwards, also
    when one of the intermediate steps raises.
    """
    basename = str(os.path.basename(modelName))
    os.symlink(refdatabase, basename)
    # pyfaidx writes the index next to the indexed file as "<file>.fai".
    # Address exactly that file instead of scanning the directory for any
    # "*.fai", which could pick up (and later delete) an unrelated index.
    indexFile = os.path.join('.', basename + '.fai')
    try:
        pyfaidx.Faidx(basename)
        print('reading reference file: ' + str(refdatabase) + "\n")
        print('Indexing reference file....' + "\n")

        refindex = parseIndexRef(indexFile)
        df = createfctable(table)
        print('reading count matrix: ' + str(table))
        background_model = create_model(refindex, df[0], 'background')
        control_model = create_model(refindex, df[1], 'control')
        experiment_model = create_model(refindex, df[2], 'experiment')
        control = control_model.to_records(index=False)
        experiment = experiment_model.to_records(index=False)
        background = background_model.to_records(index=False)
        records = [background, control, experiment]
        print('writing model to disk... ')
        outf = modelName + '.p'
        # The context manager guarantees the pickle is flushed and closed.
        with open(outf, 'wb') as output:
            pickle.dump(records, output)
    finally:
        os.remove(basename)
        # The index may not exist if indexing itself failed.
        if os.path.exists(indexFile):
            os.remove(indexFile)

    print('Model generation is complete')
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
示例#5
0
def bin_contigs(bin_dict, assembly_file, output='bin.'):
    """Write one FASTA file per bin with the bin's contigs from the assembly.

    Args:
        bin_dict: mapping of a numeric bin id to an iterable of contig names.
        assembly_file: path of the assembly FASTA (indexed via pyfaidx).
        output: prefix of the output files; each bin is written to
            ``<output><bin id + 2>.fna``.
    """
    assembly = pyfaidx.Faidx(assembly_file)
    for bin_id, members in bin_dict.items():
        # File numbering is offset by two relative to the bin id.
        out_path = output + str(bin_id + 2) + '.fna'
        with open(out_path, 'w') as handle:
            for name in members:
                # Fetch the full contig: positions 1 .. sequence length.
                record = assembly.fetch(name, 1, assembly.index[name].rlen)
                handle.write(">" + record.name + '\n' + record.seq + '\n')
示例#6
0
文件: fa2wgs.py 项目: zorrodong/PSiTE
def build_fai(fasta=None):
    '''
    Index a fasta file with pyfaidx.Faidx and return a confirmation string.

    This wrapper exists so that exceptions raised in a child process can be
    handled through result.get() when the call is submitted with apply_async:
    pyfaidx.Faidx itself has no return value, which induces an error there,
    so a message string is returned instead.
    '''
    message_template = 'Built index for {}'
    pyfaidx.Faidx(fasta)
    return message_template.format(fasta)
示例#7
0
文件: fa2wgs.py 项目: icelu/CSiTE
def genomesize(fasta=None):
    '''
    Return the genome size of a .fa file: the sum of the sequence lengths
    (rlen) of all records in its pyfaidx index.
    '''
    fa = pyfaidx.Faidx(fasta)
    return sum(record.rlen for record in fa.index.values())
示例#8
0
    def __init__(self, fa_path, oligo_length, n_oligos_per_target,
                 anneal_length, annealing_filters, cushion_length):
        """Store the design parameters and open the FASTA index.

        Args:
            fa_path: path of the FASTA file, opened with ``pyfaidx.Faidx``.
            oligo_length: stored as ``olg_len``.
            n_oligos_per_target: stored as ``n_olg_per_tgt``.
            anneal_length: stored as ``anl_len``.
            annealing_filters: stored as ``anl_filters``.
            cushion_length: stored as ``csh_len``.

        ``tot_len`` is derived as
        ``oligo_length * n - anneal_length * (n - 1)`` with
        ``n = n_oligos_per_target`` -- presumably the span covered by ``n``
        oligos whose neighbours overlap by ``anneal_length``.
        """
        self.faidx = pyfaidx.Faidx(fa_path)

        (self.olg_len,
         self.n_olg_per_tgt,
         self.anl_len,
         self.anl_filters,
         self.csh_len) = (oligo_length,
                          n_oligos_per_target,
                          anneal_length,
                          annealing_filters,
                          cushion_length)

        n_overlaps = n_oligos_per_target - 1
        self.tot_len = (oligo_length * n_oligos_per_target
                        - anneal_length * n_overlaps)
示例#9
0
文件: fa2wes.py 项目: zorrodong/PSiTE
def check_normal_fa(normal_dir):
    '''
    Verify that normal_dir holds one fasta file per haplotype
    (normal.parental_0.fa and normal.parental_1.fa) and index each of them.

    Raises argparse.ArgumentTypeError for the first file that is missing.
    '''
    for parental in (0, 1):
        fasta = '{}/normal.parental_{}.fa'.format(normal_dir, parental)
        if os.path.isfile(fasta):
            # Create index file (.fai) for each fasta
            pyfaidx.Faidx(fasta)
        else:
            raise argparse.ArgumentTypeError(
                'Cannot find normal.parental_{}.fa under directory: {}'.format(
                    parental, normal_dir))
示例#10
0
    def pyfaidx_faidx(n):
        """Benchmark pyfaidx.Faidx over ``n`` rounds and print the results.

        Each round times index construction and a ``read_faidx`` pass
        separately, then deletes the index file so the next round rebuilds it.
        A final untimed round runs under ``tracemalloc``; the traced memory,
        the mean build time and the scaled mean read time are printed.
        """
        print('timings for pyfaidx.Faidx')
        build_times = []
        read_times = []
        for _ in range(n):
            started = time.time()
            faidx = pyfaidx.Faidx(fa_file.name)
            build_times.append(time.time() - started)

            started = time.time()
            read_faidx(faidx, headers)
            read_times.append(time.time() - started)
            # Drop the index so the next round has to build it again.
            os.remove(index)

        # profile memory usage and report timings
        tracemalloc.start()
        faidx = pyfaidx.Faidx(fa_file.name)
        read_faidx(faidx, headers)
        os.remove(index)
        print(tracemalloc.get_traced_memory())
        print(mean(build_times))
        print(mean(read_times)/nreads/10*1000*1000)
        tracemalloc.stop()
示例#11
0
文件: fa2wes.py 项目: zorrodong/PSiTE
def check_tumor_fa(tumor_dir, sectors, simulator):
    '''
    Check and index the per-haplotype fasta files of every tip node that
    appears in the composition of any sector.

    For the 'capgem' simulator, also ensure the size of a chromosome is not
    too large for 'samtools index' (no larger than MAX_CHROM).
    See https://github.com/samtools/htsjdk/issues/447 for the issues regarding large chromosomes.

    Raises argparse.ArgumentTypeError when a fasta file is missing or a
    chromosome exceeds the limit.
    '''
    # Collect the distinct tip nodes over all sectors.
    tipnodes = set()
    for sector_info in sectors.values():
        tipnodes = tipnodes | set(sector_info['composition'].keys())

    for tipnode in tipnodes:
        for parental in (0, 1):
            fasta = '{}/{}.parental_{}.fa'.format(tumor_dir, tipnode, parental)
            if not os.path.isfile(fasta):
                raise argparse.ArgumentTypeError(
                    'Cannot find {}.parental_{}.fa under directory: {}'.format(
                        tipnode, parental, tumor_dir))
            # Create index file (.fai) for each fasta
            fa = pyfaidx.Faidx(fasta)
            # The chromosome size limit only applies to capgem.
            if simulator != 'capgem':
                continue
            for chroms, record in fa.index.items():
                chr_len = record.rlen
                if chr_len > MAX_CHROM:
                    raise argparse.ArgumentTypeError(
                        'The size of chromsome {} ({}) for {} is larger than 512 M!'
                        .format(chroms, chr_len, fasta))
    'Rscript ~/Develop/MicroHomologyMediatedTandemDuplications/reference_genome_TRs/code/TR_MH_build_results_table.R',
    help='parse processed TRF tab file & find MHPs')
args = parser.parse_args()

# ftp://ftp.ensemblgenomes.org/pub/fungi/release-46/variation/vcf/schizosaccharomyces_pombe/schizosaccharomyces_pombe_incl_consequences.vcf.gz.csi
# ftp://ftp.ensemblgenomes.org/pub/fungi/release-46/fasta/schizosaccharomyces_pombe/dna/Schizosaccharomyces_pombe.ASM294v2.dna.toplevel.fa.gz
#  WORKDIR = '/Users/lcarey/Downloads/test/'
#  args.fasta = WORKDIR + 'Schizosaccharomyces_pombe.ASM294v2.dna.toplevel.fa.gz'
#  args.vcf   = WORKDIR + 'schizosaccharomyces_pombe_indels.vcf'
#nt_flank = 50
#cons = pyfaidx.FastaVariant( fasta_file_name , vcf_file_name , het=True , hom=True )

# input and output files
fasta_output_filename = args.output_basename + '.fasta'
seqs_to_write = []  # empty list of seqs we'll save
fa = pyfaidx.Faidx(args.fasta)
vcf_reader = vcf.Reader(open(args.vcf, 'r'))

# for each record in the VCF file with an insertion length > MIN_LENGTH_INSERTION
#   generate the REF and each ALT seq
#   save these to a fasta file
for record in vcf_reader:
    print_me_flag = False
    for r in record.ALT:
        if len(r.sequence) >= args.MIN_LENGTH_INSERTION:
            print_me_flag = True
    if print_me_flag:
        try:
            left_seq = fa.fetch(record.CHROM, record.start - args.nt_flank,
                                record.start)
            right_seq = fa.fetch(record.CHROM, record.end + 1,
示例#13
0
def run(parser, args):
    '''
	Check arguments, run functions
	'''

    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][Message] VISOR HACk v1.1')

    #fill container

    c.OUT = os.path.abspath(args.output)
    c.REF = os.path.abspath(args.genome)
    c.BED = [os.path.abspath(x) for x in args.bedfile[0]]
    c.store = args.vcf  #but not yet used. Just for future reference

    #main

    if not os.path.exists(c.OUT):

        try:

            os.makedirs(c.OUT)

        except:

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Error] Cannot create the output folder')
            sys.exit(1)

    else:

        if not os.access(os.path.abspath(c.OUT), os.W_OK):

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now +
                  '][Error] Missing write permissions on the output folder')
            sys.exit(1)

        elif os.listdir(os.path.abspath(c.OUT)):

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print(
                '[' + now +
                '][Error] The output folder is not empty: specify another output folder or clean the current one'
            )
            sys.exit(1)

    if which('bedtools') is None:

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Error] bedtools must be in PATH')
        sys.exit(1)

    try:

        ref = pyfaidx.Fasta(c.REF)
        chrs = ref.keys()

    except:

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print(
            '[' + now +
            '][Error] Reference file does not exist, is not readable or is not a valid FASTA'
        )
        sys.exit(1)

    #accepted variants
    possible_variants = [
        'SNP', 'MNP', 'inversion', 'deletion', 'insertion',
        'tandem duplication', 'inverted tandem duplication',
        'perfect tandem repetition', 'approximate tandem repetition',
        'tandem repeat expansion', 'tandem repeat contraction',
        'reciprocal translocation', 'translocation cut-paste',
        'translocation copy-paste', 'interspersed duplication'
    ]
    valid_dna = 'ACGT'
    haplopattern = re.compile(
        "^h[0-9]+$"
    )  #allowed haplotypes for inter-haplotype variants (h1,h2,...)

    #this will contain variants for each haplotype
    d = dict()
    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][Message] Validating variants in BED')

    for i, bed in enumerate(c.BED):

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Validating BED ' + bed)

        try:

            bedfile = pybedtools.BedTool(bed)
            bedsrtd = bedfile.sort()

        except:

            now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
            print('[' + now + '][Error] BED ' + bed +
                  ' does not exist, is not readable or is not a valid BED')
            sys.exit(1)

        d["h{0}".format(i + 1)] = dict(
        )  #one sub-dict for each BED/haplotype. This way of specifying different haplotypes works perfectly

        for j, x in enumerate(bedsrtd):

            if x.chrom not in chrs:

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now + '][Error] Line ' + str(j + 1) +
                    ': column 1 (chromosome name) contains an invalid chromosome (not included in the reference provided)'
                )
                sys.exit(1)

            if x.start <= 0 or x.start > len(ref[x.chrom]):

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now + '][Error] Line ' + str(j + 1) +
                    ': column 2 (chromosome start) contains an invalid coordinate (lower than chromosome start or greater than chromosome end)'
                )
                sys.exit(1)

            if x.end > len(ref[x.chrom]):  #this can't be 0 I guess

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now + '][Error] Line ' + str(j + 1) +
                    ': column 3 (chromosome end) contains an invalid coordinate (greater than chromosome end)'
                )
                sys.exit(1)

            #check if 4th field is a valid/supported variant

            if x[3] not in possible_variants:

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now + '][Error] Line ' + str(j + 1) +
                    ': column 4 (variant type) contains an unsupported variant type'
                )
                sys.exit(1)

            #check if 6th field can be converted to integer

            try:

                int(x[5])

            except:

                now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                print(
                    '[' + now + '][Error] Line ' + str(j + 1) +
                    ': column 6 (length of random sequence to insert at breakpoint) must contain an integer'
                )
                sys.exit(1)

            #now validate infos (5th field) for each variant

            if x[3] == 'SNP':

                if x[4] not in list(
                        valid_dna
                ):  #single base must be a valid base. This also checks for length greater than 1

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid DNA base'
                    )
                    sys.exit(1)

                if int(x[5]) != 0:  #no random sequence at breakpoint: not a SV

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Warning] Line ' + str(j + 1) +
                        ': column 6 (length of random sequence to insert at breakpoint) coherced to 0'
                    )

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         x[4], '')]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4], ''))

            elif x[3] == 'MNP':

                if not all(y in list(valid_dna) for y in str(
                        x[4])):  #check that every base is a valid base

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ':  column 5 (variant information) contains an invalid DNA sequence'
                    )
                    sys.exit(1)

                if len(
                        str(x[4])
                ) != x.end - x.start + 1:  #check that the length of the sequence that has to be replaced mathces the length of a user-defined sequence

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ':  column 5 (variant information) contains a sequence shorter/longer than region'
                    )
                    sys.exit(1)

                if int(x[5]) != 0:  #no random sequence at breakpoint: not a SV

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Warning] Line ' + str(j + 1) +
                        ': column 6 (length of random sequence to insert at breakpoint) coherced to 0'
                    )

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         x[4], '')]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4], ''))

            elif x[3] == 'inversion':

                if x[4] != 'None':  #tolerate not-None, as there is only one chance

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print('[' + now + '][Warning] Line ' + str(j + 1) +
                          ':  column 5 (variant information) coherced to None')

                randomseq = ''.join(random.choices(valid_dna, k=int(
                    x[5])))  #if x[5] equal to 0, this is empty anyway

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         x[4], randomseq)]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4], randomseq))

            elif x[3] == 'deletion':

                if x[4] not in {
                        'None', '1bp'
                }:  #these are the accepted possibilities by now

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid istruction. Must be either "None" or "1bp"'
                    )
                    sys.exit[1]

                randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         x[4], randomseq)]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4], randomseq))

            elif x[3] == 'insertion':

                if not (all(y in list(valid_dna) for y in x[4].upper())
                        ):  #sequence to insert must be a valid DNA string

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid DNA sequence'
                    )
                    sys.exit(1)

                randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [
                        (x.start, x.end, x[3], x[4].upper(), randomseq)
                    ]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4].upper(), randomseq))

            elif x[3] == 'tandem duplication' or x[
                    3] == 'inverted tandem duplication':  #same checks for these variants

                try:

                    int(x[4])

                except:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain an integer'
                    )
                    sys.exit(1)

                randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         int(x[4]), randomseq)]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], int(x[4]), randomseq))

            elif x[3] == 'perfect tandem repetition':

                column5 = x[4].split(':')  #info must be in this format

                if len(column5) != 2:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must be in string:integer format'
                    )
                    sys.exit(1)

                if not (all(y in list(valid_dna) for y in column5[0].upper())
                        ):  #sequence must be a valid DNA string

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid DNA motif'
                    )
                    sys.exit(1)

                try:

                    int(column5[1])

                except:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain an integer specifying the number of repetitions'
                    )
                    sys.exit(1)

                if int(x[5]) != 0:  #no random sequence at breakpoint: not a SV

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Warning] Line ' + str(j + 1) +
                        ': column 6 (length of random sequence to insert at breakpoint) coherced to 0'
                    )

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         x[4], '')]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4], ''))

            elif x[3] == 'approximate tandem repetition':

                column5 = x[4].split(':')  #info must be in this format

                if len(column5) != 3:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must be in string:integer:integer format'
                    )
                    sys.exit(1)

                if not (all(y in list(valid_dna) for y in column5[0].upper())
                        ):  #sequence must be a valid DNA string

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid DNA motif'
                    )
                    sys.exit(1)

                try:

                    int(column5[1])

                except:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain an integer specifying the number of repetitions'
                    )
                    sys.exit(1)

                try:

                    int(column5[2])

                except:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain an integer specifying the number of errors in repetition'
                    )
                    sys.exit(1)

                if int(x[5]) != 0:  #no random sequence at breakpoint: not a SV

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Warning] Line ' + str(j + 1) +
                        ': column 6 (length of random sequence to insert at breakpoint) coherced to 0'
                    )

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         x[4], '')]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4], ''))

            elif x[3] == 'tandem repeat expansion' or x[
                    3] == 'tandem repeat contraction':  #same checks for these variants

                column5 = x[4].split(':')  #info must be in this format

                if len(column5) != 2:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must be in string:integer format'
                    )
                    sys.exit(1)

                if not (all(y in list(valid_dna) for y in column5[0].upper())
                        ):  #sequence must be a valid DNA string

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid DNA motif'
                    )
                    sys.exit(1)

                try:

                    int(column5[1])

                except:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain an integer specifying the number of repetitions to add or subtract'
                    )
                    sys.exit(1)

                if int(x[5]) != 0:  #no random sequence at breakpoint: not a SV

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Warning] Line ' + str(j + 1) +
                        ': column 6 (length of random sequence to insert at breakpoint) coherced to 0'
                    )

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [(x.start, x.end, x[3],
                                                         x[4], '')]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, x[3], x[4], ''))

            elif x[3] == 'reciprocal translocation':

                column5 = x[4].split(':')  #info must be in this format

                if len(column5) != 5:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must be in string:string:integer:string:string format'
                    )
                    sys.exit(1)

                if not haplopattern.match(column5[0]):

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain a valid haplotype string'
                    )
                    sys.exit(1)

                if column5[1] not in chrs:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid chromosome string (chromosome is not included in the reference provided)'
                    )
                    sys.exit(1)

                try:

                    int(column5[2])

                except:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain an integer specifying the breakpoint coordinate on the second chromosome'
                    )
                    sys.exit(1)

                if column5[3] not in {
                        'forward', 'reverse'
                } or column5[4] not in {'forward', 'reverse'}:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain valid orientations (either "forward" or "reverse")'
                    )
                    sys.exit(1)

                #translocate second to first

                newtype = 'deletion-insertion'

                #get second sequence[2]
                firstbase = int(column5[2])
                lastbase = int(column5[2]) + (x.end - x.start + 1)

                if firstbase <= 0 or firstbase > len(ref[column5[1]]):

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) breakpoint coordinate lies outside chromosome (lower than chromosome start or greater than chromosome end)'
                    )
                    sys.exit(1)

                if lastbase > len(
                        ref[column5[1]]
                ):  #last base can't be 0 as it is start (that can't be lower than 0) + something.

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) breakpoint coordinate lies outside chromosome (greater than chromosome end)'
                    )
                    sys.exit(1)

                if column5[4] == 'reverse':

                    transeq = ref[
                        column5[1]][firstbase:lastbase].reverse.complement.seq

                else:

                    transeq = ref[column5[1]][firstbase:lastbase].seq

                randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                    d["h{0}".format(i + 1)][x.chrom] = [
                        (x.start, x.end, newtype, transeq, randomseq)
                    ]

                else:

                    d["h{0}".format(i + 1)][x.chrom].append(
                        (x.start, x.end, newtype, transeq, randomseq))

                #translocate first to second

                if column5[0] not in d.keys():

                    d[column5[0]] = dict(
                    )  #initialize haplotype dict if not present

                #get first sequence

                if column5[3] == 'reverse':

                    transeq = ref[x.chrom][x.start -
                                           1:x.end].reverse.complement.seq

                else:

                    transeq = ref[x.chrom][x.start - 1:x.end].seq

                randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                if column5[1] not in d[column5[0]].keys():  #store

                    d[column5[0]][column5[1]] = [(firstbase + 1, lastbase,
                                                  newtype, transeq, randomseq)]

                else:

                    d[column5[0]][column5[1]].append(
                        (firstbase + 1, lastbase, newtype, transeq, randomseq))

            elif x[3] == 'translocation cut-paste' or x[
                    3] == 'translocation copy-paste' or x[
                        3] == 'interspersed duplication':  #same info for these 2

                column5 = x[4].split(':')  #info must be in this format

                if len(column5) != 4:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must be in string:string:integer:string format'
                    )
                    sys.exit(1)

                if not haplopattern.match(column5[0]):

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain a valid haplotype string'
                    )
                    sys.exit(1)

                if column5[1] not in chrs:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) contains an invalid chromosome string (chromosome is not included in the reference provided)'
                    )
                    sys.exit(1)

                try:

                    int(column5[2])

                except:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain an integer specifying the breakpoint coordinate on the second chromosome'
                    )
                    sys.exit(1)

                if int(column5[2]) <= 0 or int(column5[2]) > len(
                        ref[column5[1]]):

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) breakpoint coordinate lies outside chromosome (lower than chromosome start or greater than chromosome end)'
                    )
                    sys.exit(1)

                if column5[3] not in {'forward', 'reverse'}:

                    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
                    print(
                        '[' + now + '][Error] Line ' + str(j + 1) +
                        ': column 5 (variant information) must contain a valid orientation (either "forward" or "reverse")'
                    )
                    sys.exit(1)

                if column5[0] not in d.keys():

                    d[column5[0]] = dict(
                    )  #initialize haplotype dict if not present

                if x[3] == 'translocation cut-paste':

                    newtype1 = 'deletion'
                    newtype2 = 'insertion'

                    if column5[3] == 'reverse':

                        transeq = ref[x.chrom][x.start -
                                               1:x.end].reverse.complement.seq

                    else:

                        transeq = ref[x.chrom][x.start - 1:x.end].seq

                    randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                    #delete first

                    if x.chrom not in d["h{0}".format(i + 1)].keys():  #store

                        d["h{0}".format(i + 1)][x.chrom] = [
                            (x.start, x.end, newtype1, 'None', randomseq)
                        ]

                    else:

                        d["h{0}".format(i + 1)][x.chrom].append(
                            (x.start, x.end, newtype1, 'None', randomseq))

                    #insert in second

                    randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                    if column5[1] not in d[column5[0]].keys():  #store

                        d[column5[0]][column5[1]] = [
                            (int(column5[2]) - 1, int(column5[2]), newtype2,
                             transeq, randomseq)
                        ]

                    else:

                        d[column5[0]][column5[1]].append(
                            (int(column5[2]) - 1, int(column5[2]), newtype2,
                             transeq, randomseq))

                else:  #is copy-paste/intersperded dup

                    newtype = 'insertion'

                    if column5[3] == 'reverse':

                        transeq = ref[x.chrom][x.start -
                                               1:x.end].reverse.complement.seq

                    else:

                        transeq = ref[x.chrom][x.start - 1:x.end].seq

                    randomseq = ''.join(random.choices(valid_dna, k=int(x[5])))

                    #only insert

                    if column5[1] not in d[column5[0]].keys():  #store

                        d[column5[0]][column5[1]] = [(int(column5[2]) - 1,
                                                      int(column5[2]), newtype,
                                                      transeq, randomseq)]

                    else:

                        d[column5[0]][column5[1]].append(
                            (int(column5[2]) - 1, int(column5[2]), newtype,
                             transeq, randomseq))

    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][Message] BED validated and variants organized')
    print('[' + now + '][Message] Generating modified FASTA haplotypes')

    for dicts in d.keys():

        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Generating haplotype ' + dicts)
        hapout = os.path.abspath(c.OUT + '/' + dicts + '.fa')
        HapMaker(ref, chrs, d[dicts], hapout)
        now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
        print('[' + now + '][Message] Indexing FASTA haplotype')
        pyfaidx.Faidx(hapout)

    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][Message] Done')
    sys.exit(0)
# Notebook-style setup cell: load the human target table published with the
# Basenji cross2020 manuscript and index the reference genome FASTA.

# Raw GitHub URL of the tab-separated target description file.
targets_txt = 'https://raw.githubusercontent.com/calico/basenji/master/manuscripts/cross2020/targets_human.txt'
# pandas reads directly from the URL; the file is parsed as tab-separated.
df_targets = pd.read_csv(targets_txt, sep='\t')
# Preview of the first three rows. Only displayed when run as the last
# expression of a notebook cell; in a plain script the result is discarded.
df_targets.head(3)

"""### Download files

Download and index the reference genome fasta file

Credit to Genome Reference Consortium: https://www.ncbi.nlm.nih.gov/grc

Schneider et al 2017 http://dx.doi.org/10.1101/gr.213611.116: Evaluation of GRCh38 and de novo haploid genome assemblies demonstrates the enduring quality of the reference assembly
"""

# The two shell commands below are notebook "!" magics kept as comments; they
# create the data directory and download/decompress hg38 into `fasta_file`.
# !mkdir -p /root/data
# !wget -O - http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz | gunzip -c > {fasta_file}
# Index the reference FASTA. `fasta_file` is not defined in this excerpt and
# must already exist on disk when this line runs.
pyfaidx.Faidx(fasta_file)
# !ls /root/data

"""Download the clinvar file. Reference:

Landrum MJ, Lee JM, Benson M, Brown GR, Chao C, Chitipiralla S, Gu B, Hart J, Hoffman D, Jang W, Karapetyan K, Katz K, Liu C, Maddipatla Z, Malheiro A, McDaniel K, Ovetsky M, Riley G, Zhou G, Holmes JB, Kattman BL, Maglott DR. ClinVar: improving access to variant interpretations and supporting evidence. Nucleic Acids Res . 2018 Jan 4. PubMed PMID: 29165669 .

"""

# Notebook "!" magic kept as a comment: downloads the GRCh38 ClinVar VCF.
# !wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz -O /root/data/clinvar.vcf.gz

"""### Code (double click on the title to show the code)"""

# @title `Enformer`, `EnformerScoreVariantsNormalized`, `EnformerScoreVariantsPCANormalized`,
# NOTE(review): presumably the model's input window length in base pairs --
# confirm against the Enformer class definitions that follow this cell.
SEQUENCE_LENGTH = 393216
示例#15
0
def create_bam(sample,
               files_in1,
               files_in2,
               ref_fasta,
               probes_dict,
               output,
               has_trimmed_primers=True,
               debug=False):
    """
    Create a BAM file with reads placed at their expected locations, adjusted through pairwise alignment to the target sequences.

    This will give reasonable results as long as probes capture the exact target sequences, but will generate
    alignments with many mismatches if there are any discrepancies.

    Args:
        sample: Sample name; used as the read group ID/SM in the BAM header
            and as the ``RG`` tag of every read.
        files_in1: List of FASTQ paths holding the first reads of each pair.
        files_in2: List of FASTQ paths holding the second reads; must have
            the same length as ``files_in1`` and is consumed in lockstep.
        ref_fasta: Path of the reference FASTA. It is opened with
            ``rebuild=False``, so an existing index is used as-is.
        probes_dict: Column-oriented probe table, indexed first by column
            name and then by probe ID. The columns read here are ``chr``,
            ``strand``, ``target_start_0``, ``target_end``,
            ``target_length``, ``probe_start_0``, ``probe_end`` and
            ``capture_size``.
        output: Path of the BAM file to write.
        has_trimmed_primers: If true, reads are aligned against the target
            region; otherwise against the full probe region (the original
            author marked this second path as untested).
        debug: If true, print intermediate values, stop reading a file once
            more than 10 pairs have been counted, and write only the first
            read of each pair.

    Raises:
        AssertionError: If the input lists differ in length, the two reads of
            a pair have different or empty names, or the fetched reference
            sequence does not have the expected length.
        Exception: If a probe refers to a chromosome missing from the
            reference, or has a strand other than ``+`` / ``-``.

    Pairs for which ``AmplimapNoAlignment`` is raised are counted and
    skipped rather than aborting the run.
    """
    assert len(files_in1) == len(files_in2)

    tStart = time.time()
    counters = collections.Counter()

    ref_idx = pyfaidx.Faidx(ref_fasta, rebuild=False)
    bam_header = {
        'HD': {
            'VN': '1.0'
        },
        'SQ': [{
            'LN': record.rlen,
            'SN': name
        } for name, record in ref_idx.index.items()],
        'RG': [{
            'ID': sample,
            'SM': sample
        }],
        'PG': [{
            'ID': __title__,
            'PN': __title__,
            'VN': __version__
        }],
    }

    # BAM records refer to chromosomes by their position in the header.
    chr_indices = {
        chrom: index
        for index, chrom in enumerate(ref_idx.index.keys())
    }

    with pysam.AlignmentFile(output, "wb", header=bam_header) as pairedreads:
        for ixfile, (file1, file2) in enumerate(zip(files_in1, files_in2)):
            log.info('Processing %s and %s (#%d)', file1, file2, ixfile)

            counters['files'] += 1
            # Compression is detected from the first file's name and the same
            # opener is used for its mate.
            opener = gzip.open if file1.endswith('.gz') else open
            with opener(file1, 'rt') as hdl1, opener(file2, 'rt') as hdl2:
                for read_pair in zip(
                        Bio.SeqIO.QualityIO.FastqGeneralIterator(hdl1),
                        Bio.SeqIO.QualityIO.FastqGeneralIterator(hdl2)):
                    counters['pairs_total'] += 1
                    if counters['pairs_total'] % 50000 == 0:
                        elapsed = time.time() - tStart
                        log.info(
                            "processed %d pairs - %.f sec elapsed, %.4f sec/pair, %.1f pairs/sec",
                            counters['pairs_total'], elapsed,
                            elapsed / counters['pairs_total'],
                            counters['pairs_total'] / elapsed)

                    if debug and counters['pairs_total'] > 10:
                        print('DEBUG - stopping after ',
                              counters['pairs_total'])
                        break

                    # extract and parse read name (everything before the
                    # first tab; the remainder holds tab-separated tags)
                    read_names_original = [
                        read_pair[0][0].split('\t')[0],
                        read_pair[1][0].split('\t')[0],
                    ]
                    assert len(read_names_original[0]) > 0
                    assert read_names_original[0] == read_names_original[1]
                    read_name, read_probe, read_umi = parse_extended_read_name(
                        read_names_original[0])

                    probe_chr = probes_dict['chr'][read_probe]
                    if probe_chr not in chr_indices:
                        raise Exception(
                            'Probe {} is associated with chromosome {}, but this entry does not exist in the reference fasta file!'
                            .format(read_probe, probe_chr))
                    probe_chr_index = chr_indices[probe_chr]

                    read_lens = [
                        len(read_pair[read_number][1])
                        for read_number in range(2)
                    ]

                    # untested: if we haven't trimmed off the primers then we need to start aligning from the primer start location!
                    if has_trimmed_primers:
                        probe_start = int(
                            probes_dict['target_start_0'][read_probe] + 1)
                        probe_end = int(probes_dict['target_end'][read_probe])
                    else:
                        probe_start = int(
                            probes_dict['probe_start_0'][read_probe] + 1)
                        probe_end = int(probes_dict['probe_end'][read_probe])

                    # Initial guess for the read positions; both entries are
                    # overwritten below once the reads have been aligned.
                    if probes_dict['strand'][read_probe] == '+':
                        read_starts = [
                            probe_start, probe_end - read_lens[1] + 1
                        ]
                        read_reverse = [False, True]
                    elif probes_dict['strand'][read_probe] == '-':
                        read_starts = [
                            probe_end - read_lens[0] + 1, probe_start
                        ]
                        read_reverse = [True, False]
                    else:
                        raise Exception(
                            'Unexpected strand for probe {}'.format(
                                read_probe))

                    # NOTE: this SHOULD BE one-based based on documentation
                    # but actually seem to be ZERO-based -- at least the sequence we get for PRRX1-Ex1
                    # starts CGGA but should start GGA; ends TTC but should end TTCT if we just use probe_start and probe_end
                    # they are always in genomic sense
                    probe_target_sequence = str(
                        ref_idx.fetch(probe_chr, probe_start,
                                      probe_end)).upper()

                    # sanity check that we got the right sequence
                    if has_trimmed_primers:
                        assert len(probe_target_sequence) == probes_dict[
                            'target_length'][read_probe]
                    else:
                        assert len(probe_target_sequence
                                   ) == probes_dict['capture_size'][read_probe]

                    if debug:
                        print(read_name, read_probe, read_umi)
                        print(probe_chr, probe_chr_index, probe_start,
                              probe_end)
                        print(read_starts)
                        print(read_reverse)
                        print(probe_target_sequence)
                        print(read_pair)

                    try:
                        # pre-process alignments to make sure the mate starts are actually correct
                        read_cigars = []
                        read_sequences = []
                        read_tags_for_pysam = []
                        for read_number in range(2):
                            # copy over our custom tags from FASTQ file.
                            # Tags are TAG:TYPE:VALUE; split at most twice so
                            # that a VALUE containing ':' stays intact
                            # instead of breaking the three-way unpack below.
                            read_tags = [
                                tag.split(':', 2) for tag in
                                read_pair[read_number][0].split('\t')[1:]
                            ]
                            read_tags_for_pysam.append(
                                [("RG", sample, "Z")] +
                                [(tag_name, int(tag_value) if tag_type ==
                                  'i' else tag_value, tag_type) for tag_name,
                                 tag_type, tag_value in read_tags])
                            if debug:
                                print(read_tags)
                                print(read_tags_for_pysam[read_number])

                            # figure out sequence
                            read_sequence = str(
                                Bio.Seq.Seq(read_pair[read_number][1]).
                                reverse_complement()) if read_reverse[
                                    read_number] else read_pair[read_number][1]
                            read_sequences.append(read_sequence)

                            # align read to target sequence -- note both of these are in genomic sense!
                            try:
                                cigar_read_start_offset, cigartuples = align_and_find_cigar(
                                    read_sequence, probe_target_sequence)
                                read_tags_for_pysam[read_number].append(
                                    ("so", cigar_read_start_offset, 'i'))

                                # remember cigar
                                read_cigars.append(cigartuples)
                                # adjust start -- need to use zero-based coords here but probe_start is 1-based
                                read_starts[
                                    read_number] = probe_start - 1 + cigar_read_start_offset
                            except AssertionError:
                                # Re-run with debug output so the failing
                                # alignment is printed, then propagate.
                                cigar_read_start_offset, cigartuples = align_and_find_cigar(
                                    read_sequence,
                                    probe_target_sequence,
                                    debug=True)
                                raise

                        if debug:
                            print(read_cigars)
                            print(read_starts)

                        for read_number in range(2):

                            # create aligned segment
                            a = pysam.AlignedSegment()
                            a.mapping_quality = 255  #always best quality
                            a.query_name = read_names_original[0]
                            a.query_sequence = read_sequences[read_number]
                            # qualities must follow the sequence orientation
                            a.query_qualities = pysam.qualitystring_to_array(
                                read_pair[read_number][2][::-1]
                                if read_reverse[read_number] else
                                read_pair[read_number][2])
                            a.set_tags(read_tags_for_pysam[read_number])
                            a.cigartuples = read_cigars[read_number]

                            a.reference_id = probe_chr_index
                            a.reference_start = read_starts[read_number]
                            a.next_reference_id = probe_chr_index
                            a.next_reference_start = read_starts[1 -
                                                                 read_number]
                            # a.template_length = read_lens[read_number]

                            a.is_paired = True
                            a.is_proper_pair = True
                            a.is_read1 = read_number == 0
                            a.is_read2 = read_number == 1
                            a.is_reverse = read_reverse[read_number]
                            a.mate_is_reverse = read_reverse[1 - read_number]

                            if debug:
                                print(a)
                            pairedreads.write(a)
                            if debug:
                                break
                    # normally we always get an alignment, but apparently sometimes we don't?
                    except AmplimapNoAlignment:
                        counters['no_alignment'] += 1

    log.info('%s done - %d pairs in total, %d without alignment', sample,
             counters['pairs_total'], counters['no_alignment'])

    log.info("BAM file created: %s", output)
示例#16
0
def main():
    """Index the reference, run the selected read simulation and clean up.

    Everything is driven by module-level settings parsed elsewhere (``ref``,
    ``outfilename``, ``args``, ``countModel``, ``diffmodel``, ``fragmodel``,
    ``readlen``, ``sqmodel``, ``se_class``, ``readtot``, ``start_time``).

    Steps:
      1. If no reference was given, show the manual page and exit.
      2. Symlink the reference into the working directory under the output
         base name and index it with pyfaidx.
      3. Pick the simulation mode from the count/differential models and call
         ``process_inputFiles.compilefastqrecord`` once.
      4. Remove the temporary symlink and its index and report the duration.

    Supplying both a count model and a differential model matches no mode;
    nothing is simulated in that case (unchanged from earlier behaviour).
    """

    if ref is None:
        man.manpage()
        sys.exit()

    # print() returns None, so the text is printed and logged separately
    # rather than passing the result of print() to the logger.
    for message in ('reading reference file: ' + str(ref),
                    'Indexing reference file....'):
        print(message + "\n")
        errlog.info(message)

    basename = str(os.path.basename(outfilename))
    os.symlink(ref, basename)
    pyfaidx.Faidx(basename)
    # pyfaidx writes the index next to the indexed file as "<name>.fai".
    # Use that exact path instead of scanning the directory for any *.fai,
    # which could pick up -- and later delete -- an unrelated index.
    indexFile = os.path.join('.', basename + '.fai')
    ref_index = process_inputFiles.parseIndexRef(indexFile)

    # Which expression profile drives the simulation.
    if countModel is None and diffmodel is None:
        profile = 'default'
    elif diffmodel is None:
        profile = 'empirical profile'
    elif countModel is None:
        profile = 'differential'
    else:
        profile = None  # both models supplied: no mode is defined for this

    if profile is not None and (args.se or args.pe):
        if args.se:
            layout = 'se'
            # single-end runs never use a fragment-length model
            frag = None
            label = ('differential expression'
                     if profile == 'differential' else profile)
            print('running single-end in ' + label + ' mode')
        else:
            layout = 'pe'
            frag = fragmodel
            fl_label = 'no FL model' if fragmodel is None else 'FL model'
            print('running paired-end in ' + profile + ' mode with ' +
                  fl_label)

        # One call for every mode. The unused model is None by construction,
        # so the count model is always forwarded when present -- including
        # paired-end empirical mode with an FL model, which previously
        # dropped it.
        process_inputFiles.compilefastqrecord(readlen,
                                              ref,
                                              ref_index,
                                              sqmodel,
                                              outfilename,
                                              se_class,
                                              layout,
                                              model=countModel,
                                              diff=diffmodel,
                                              readtot=readtot,
                                              fragmodel=frag)

    os.remove(basename)
    os.remove(indexFile)
    print('Simulation is complete')
    errlog.info('Simulation is complete')
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
示例#17
0
def generate_fragments(
    x,
    fragSim_exe,
    fragSim_params,
    deamSim_exe,
    deamSim_params,
    adptSim_exe,
    adptSim_params,
    art_exe,
    art_params,
    libprep,
    tmp_dir,
    exp_data,
    debug,
    genome_table,
):
    """Run the read-simulation chain for one genome, per fragment type.

    Depending on ``x["onlyAncient"]`` the chain is run for:

    * ``False`` -- "ancient" if ``x["fragments_ancient"]`` is not None, and
      "modern" if ``x["fragments_modern"]`` is not None (either, both or
      neither);
    * ``True``  -- "ancient" only;
    * ``None``  -- "modern" only.

    For each selected type the steps are fragSim -> (deamSim, ancient only)
    -> adptSim -> ART, followed by ``collect_file_names``.

    Args:
        x: record for one genome; read for the keys "taxon", "onlyAncient",
            "fragments_ancient" and "fragments_modern", and forwarded to
            ``prepare_data_fragments`` and ``collect_file_names``.
        fragSim_exe, deamSim_exe, adptSim_exe, art_exe: forwarded as ``exe``
            to the matching ``run_*`` helper.
        fragSim_params: indexed by fragment type ("ancient" / "modern").
        deamSim_params, adptSim_params, art_params: forwarded as ``params``.
        libprep: forwarded to ``run_deamSim``.
        tmp_dir: forwarded to ``prepare_data_fragments`` and ``run_fragSim``.
        exp_data: read for the keys "read_length", "library" and "seqSys".
        debug: forwarded to every ``run_*`` helper.
        genome_table: filtered on its "Taxon" column; the single matching
            "Fasta_normalized" value is the reference FASTA (looks like a
            pandas DataFrame -- confirm against the caller).

    Returns:
        ``Returnvalue(files_ancient, files_modern)``; each element is the
        result of ``collect_file_names`` for that type, or an empty dict
        when that type was not simulated.

    Raises:
        ValueError: if ``x["onlyAncient"]`` is not ``True``, ``False`` or
            ``None`` (previously this surfaced as an UnboundLocalError at
            the final clean-up step).
    """
    genome_data = genome_table[(genome_table["Taxon"] == x["taxon"])]
    fasta = genome_data["Fasta_normalized"].item()
    # Create index. The object itself is not needed afterwards.
    pyfaidx.Faidx(fasta)
    read_len = exp_data["read_length"]
    library = exp_data["library"]
    seqSys = exp_data["seqSys"]

    def _simulate(frag_type):
        """Run the full chain for one fragment type; return (data, files)."""
        data = prepare_data_fragments(x=x,
                                      frag_type=frag_type,
                                      tmp_dir=tmp_dir)
        # Run fragSim
        run_fragSim(
            exe=fragSim_exe,
            params=fragSim_params[frag_type],
            seq_depth=data["seq_depth"],
            ofile=data["fragSim_ofile"],
            tmp_dir=tmp_dir,
            frag_ofile=data["fragSim_frag_ofile"],
            fasta=fasta,
            debug=debug,
        )
        if frag_type == "ancient":
            # Run deamSim. Only the ancient chain has this step; its output
            # then replaces the fragSim output as input to adptSim.
            run_deamSim(
                exe=deamSim_exe,
                params=deamSim_params,
                ofile=data["deamSim_ofile"],
                fasta=data["fragSim_ofile"],
                libprep=libprep,
                debug=debug,
            )
            # Regex to parse deamSim headers:
            # (\S+):([+-]):(\d+):(\d+):(\d+)(?:_DEAM:(.*))?
            adapter_input = data["deamSim_ofile"]
        else:
            adapter_input = data["fragSim_ofile"]
        run_adptSim(
            exe=adptSim_exe,
            params=adptSim_params,
            ofile=data["adptSim_ofile"],
            fasta=adapter_input,
            read_len=read_len,
            library=library,
            debug=debug,
        )
        run_art(
            exe=art_exe,
            params=art_params,
            seqSys=seqSys,
            fasta=data["adptSim_ofile"],
            ofile=data["art_ofile"],
            read_len=read_len,
            library=library,
            debug=debug,
        )
        # The "deamSim_ofile" entry is passed for modern runs as well, even
        # though deamSim is not executed for them.
        file_names = collect_file_names(
            fragSim_ofile=data["fragSim_ofile"],
            deamSim_ofile=data["deamSim_ofile"],
            adptSim_ofile=data["adptSim_ofile"],
            library=library,
            art_ofile=data["art_ofile"],
            frag_type=frag_type,
            x=x,
        )
        return data, file_names

    files_ancient = {}
    files_modern = {}
    last_frag_data = None  # data of the most recent run, for clean-up

    only_ancient = x["onlyAncient"]
    if only_ancient is False:
        # Run for ancient and/or modern, whichever is present.
        if x["fragments_ancient"] is not None:
            last_frag_data, files_ancient = _simulate("ancient")
        if x["fragments_modern"] is not None:
            last_frag_data, files_modern = _simulate("modern")
    elif only_ancient is True:
        last_frag_data, files_ancient = _simulate("ancient")
    elif only_ancient is None:
        last_frag_data, files_modern = _simulate("modern")
    else:
        raise ValueError(
            "x['onlyAncient'] must be True, False or None, got {!r}".format(
                only_ancient))

    # Remove the intermediate fragment file of the most recent run. Nothing
    # ran when onlyAncient is False and both fragment sets are None, so
    # there is nothing to delete in that case.
    # NOTE(review): when both types run, only the modern run's file is
    # removed (unchanged from before). If prepare_data_fragments gives each
    # type its own path, the ancient one is left behind -- confirm.
    if last_frag_data is not None:
        os.remove(last_frag_data["fragSim_frag_ofile"])
    return Returnvalue(files_ancient, files_modern)
示例#18
0
def main(progname=None):
    """Command-line entry point: build tumor genome fasta files.

    Parses the command line, creates the output directory, instantiates
    ``pyfaidx.Faidx`` on each normal fasta, then submits one ``build_fasta``
    job per ``node*.chain`` file found in the chain folder to a
    ``multiprocessing.Pool``. Finally prints the elapsed wall-clock time.

    Args:
        progname: program name shown by argparse and in the timing message;
            falls back to ``sys.argv[0]`` when empty or None.
    """
    started = time.time()
    prog = progname or sys.argv[0]

    # Defaults are named so each help text can quote its own default.
    default_output = 'tumor_fa'
    default_width = 50
    default_cores = 1

    parser = argparse.ArgumentParser(
        prog=prog,
        description=
        'Build tumor genomes from somatic variants (encoded in the chain file)'
    )
    parser.add_argument(
        '-c',
        '--chain',
        metavar='DIR',
        type=check_folder,
        required=True,
        help='the folder containing the chain files of tumor genomes')
    parser.add_argument(
        '-n',
        '--normal',
        metavar='FILES',
        type=check_normal_fastas,
        required=True,
        help='two fasta files (separated by comma) of normal genome')
    parser.add_argument(
        '-o',
        '--output',
        metavar='DIR',
        type=check_output_folder,
        default=default_output,
        help='output directory [{}]'.format(default_output))
    parser.add_argument(
        '-w',
        '--width',
        metavar='INT',
        type=int,
        default=default_width,
        help='the line width of output fasta files [{}]'.format(
            default_width))
    parser.add_argument(
        '--cores',
        metavar='INT',
        type=int,
        default=default_cores,
        help='number of cores used to run the program [{}]'.format(
            default_cores))

    args = parser.parse_args()

    os.mkdir(args.output, mode=0o755)
    normal_fa = args.normal.split(',')
    # Presumably this creates the .fai index of each normal genome before
    # the workers start -- confirm against build_fasta.
    for fasta_path in normal_fa:
        pyfaidx.Faidx(fasta_path)

    worker_pool = multiprocessing.Pool(processes=args.cores)
    chain_pattern = os.path.join(args.chain, 'node*.chain')
    pending = [
        worker_pool.apply_async(
            build_fasta,
            args=(args.output, chain_file, normal_fa, args.width))
        for chain_file in glob.glob(chain_pattern)
    ]
    worker_pool.close()
    worker_pool.join()
    # get() re-raises any exception that occurred inside a worker.
    for job in pending:
        job.get()

    finished = time.time()
    elapsed = str(finished - started)
    print("Total time running {}: {} seconds".format(prog, elapsed))