예제 #1
0
    LOG.info("Parsing VCF file...")

    # not all chromosomes/seqid will be processed if not in vcf file
    processed_seqids = OrderedDict()

    for seqid in tb.contigs:
        processed_seqids[seqid] = False

    left = VCFtoChainInfo()
    right = VCFtoChainInfo()

    chain_info = {}

    if diploid:
        left.output_file = g2g_fu.prepend_before_extension(output_file, 'left')
        right.output_file = g2g_fu.prepend_before_extension(
            output_file, 'right')
        chain_info['left'] = left
        chain_info['right'] = right

        g2g_fu.delete_file(left.output_file)
        g2g_fu.delete_file(right.output_file)
    else:
        left.output_file = output_file
        chain_info['left'] = left

        g2g_fu.delete_file(left.output_file)

    try:
        all_chrom = [c for c in fasta_file.references]
예제 #2
0
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Stage an uncompressed, indexed copy of the input fasta at the output location.

    The input fasta is copied (and decompressed first when it ends with
    '.fa.gz') to the output file name. In diploid mode two copies are made,
    one for each of the 'l' and 'r' output names.

    :param filename_fasta: input fasta file name; must end with '.fa' or
        '.fa.gz' (case-insensitive)
    :param filename_output: requested output file name; must end with '.fa'
        or '.fa.gz' (case-insensitive). A trailing '.gz' is stripped, so the
        file(s) written here are always uncompressed
    :param bgzip: not used by this function; kept for interface compatibility
    :param diploid: when True, produce separate 'l' and 'r' output files
    :return: tuple ``(filename_output_l, filename_output_r)``;
        ``filename_output_r`` is None when ``diploid`` is False
    :raises G2GValueError: if the input or output file name does not have a
        '.fa' or '.fa.gz' extension
    """

    filename_output = g2g_fu.check_file(filename_output, 'w')
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    # let's figure out what our output names will be
    new_filename_output = filename_output

    if filename_output.lower().endswith('.gz'):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    # Validate the name *after* stripping '.gz'. Checking the unstripped name
    # rejected every '.fa.gz' output even though the message allows it.
    if not new_filename_output.lower().endswith('.fa'):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        # NOTE(review): presumably inserts the tag before the '.fa' extension
        # (e.g. 'out.l.fa') -- confirm against g2g_fu.prepend_before_extension
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, 'l')
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, 'r')

        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None

        g2g_fu.delete_index_files(filename_output_l)

    # at this point the output name(s) are guaranteed to end with .fa

    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith('.fa.gz'):
        # decompress the fasta file if it is compressed

        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)                        # something.fa.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)                # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)  # cp /original/something.fa.gz /output/something.fa.gz

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        # the decompressed file is expected next to the copy, minus '.gz'
        tmp_fasta = tmp_fasta[:-3]         # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)

    elif filename_fasta.lower().endswith('.fa'):
        LOG.debug("File is not compressed")

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index; the handle is only opened for that side
    # effect, so close it right away instead of leaking it
    pysam.FastaFile(filename_output_l).close()

    return filename_output_l, filename_output_r
예제 #3
0
def prepare_fasta_patch(filename_fasta, filename_output, bgzip=False, diploid=False):
    """
    Stage an uncompressed, indexed copy of the input fasta at the output location.

    The input fasta is copied (and decompressed first when it ends with
    ".fa.gz") to the output file name. In diploid mode two copies are made,
    one for each of the "l" and "r" output names.

    :param filename_fasta: input fasta file name; must end with ".fa" or
        ".fa.gz" (case-insensitive)
    :param filename_output: requested output file name; must end with ".fa"
        or ".fa.gz" (case-insensitive). A trailing ".gz" is stripped, so the
        file(s) written here are always uncompressed
    :param bgzip: not used by this function; kept for interface compatibility
    :param diploid: when True, produce separate "l" and "r" output files
    :return: tuple ``(filename_output_l, filename_output_r)``;
        ``filename_output_r`` is None when ``diploid`` is False
    :raises G2GValueError: if the input or output file name does not have a
        ".fa" or ".fa.gz" extension
    """

    filename_output = g2g_fu.check_file(filename_output, "w")
    output_file_dir = os.path.abspath(os.path.dirname(filename_output))

    # let's figure out what our output names will be
    new_filename_output = filename_output

    if filename_output.lower().endswith(".gz"):
        # strip off .gz
        new_filename_output = filename_output[:-3]

    # Validate the name *after* stripping ".gz". Checking the unstripped name
    # rejected every ".fa.gz" output even though the message allows it.
    if not new_filename_output.lower().endswith(".fa"):
        raise G2GValueError("Expecting output filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        # NOTE(review): presumably inserts the tag before the ".fa" extension
        # (e.g. "out.l.fa") -- confirm against g2g_fu.prepend_before_extension
        filename_output_l = g2g_fu.prepend_before_extension(new_filename_output, "l")
        filename_output_r = g2g_fu.prepend_before_extension(new_filename_output, "r")

        g2g_fu.delete_index_files(filename_output_l)
        g2g_fu.delete_index_files(filename_output_r)
    else:
        filename_output_l = new_filename_output
        filename_output_r = None

        g2g_fu.delete_index_files(filename_output_l)

    # at this point the output name(s) are guaranteed to end with .fa

    # let's figure out our input and process accordingly
    if filename_fasta.lower().endswith(".fa.gz"):
        # decompress the fasta file if it is compressed

        LOG.info("Copying and decompressing fasta file")

        # copy file and preserve gz extension for bgzip -d to work
        tmp_file_name = os.path.basename(filename_fasta)  # something.fa.gz
        LOG.debug("tmp_file_name={0}".format(tmp_file_name))

        tmp_fasta = os.path.join(output_file_dir, tmp_file_name)  # /path/something.fa.gz
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, tmp_fasta))
        shutil.copy(filename_fasta, tmp_fasta)  # cp /original/something.fa.gz /output/something.fa.gz

        LOG.debug("DECOMPRESSING {0}".format(tmp_fasta))
        g2g_fu.bgzip_decompress(tmp_fasta)

        # the decompressed file is expected next to the copy, minus ".gz"
        tmp_fasta = tmp_fasta[:-3]  # /path/something.fa
        LOG.debug("tmp_fasta={0}".format(tmp_fasta))

        LOG.debug("Moving '{0}' to '{1}'...".format(tmp_fasta, filename_output_l))
        shutil.move(tmp_fasta, filename_output_l)

    elif filename_fasta.lower().endswith(".fa"):
        LOG.debug("File is not compressed")

        LOG.debug("COPYING {0} to {1}".format(filename_fasta, filename_output_l))
        shutil.copy(filename_fasta, filename_output_l)
    else:
        raise G2GValueError("Expecting input filename extension to be either '.fa.gz' or '.fa'")

    if diploid:
        LOG.debug("Copying '{0}' to '{1}'...".format(filename_output_l, filename_output_r))
        shutil.copy(filename_output_l, filename_output_r)

    # build a temporary fasta index; the handle is only opened for that side
    # effect, so close it right away instead of leaking it
    pysam.FastaFile(filename_output_l).close()

    return filename_output_l, filename_output_r
예제 #4
0
    LOG.info("Parsing VCF file...")

    # not all chromosomes/seqid will be processed if not in vcf file
    processed_seqids = OrderedDict()

    for seqid in tb.contigs:
        processed_seqids[seqid] = False

    left = VCFtoChainInfo()
    right = VCFtoChainInfo()

    chain_info = {}

    if diploid:
        left.output_file = g2g_fu.prepend_before_extension(output_file, 'left')
        right.output_file = g2g_fu.prepend_before_extension(output_file, 'right')
        chain_info['left'] = left
        chain_info['right'] = right

        g2g_fu.delete_file(left.output_file)
        g2g_fu.delete_file(right.output_file)
    else:
        left.output_file = output_file
        chain_info['left'] = left

        g2g_fu.delete_file(left.output_file)

    try:
        all_chrom = [c for c in fasta_file.references]
        all_chrom_length = [n for n in fasta_file.lengths]