Example No. 1
def maf2mvf(args):
    """Main method"""
    # ESTABLISH MAF
    args.qprint("Starting ConvertMAF2MVF")
    maf = MultiAlignFile(args)
    args.qprint("MAF Established")
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    args.qprint("MVF output initialized")
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
    samplelabels = [s.split(':')[0] for s in args.sample_tags.split(',')]
    args.qprint("Sample tags processed: {}".format(samplelabels))
    if args.ref_tag not in samplelabels:
        raise IndexError("--ref-tag not in the tags listed in --sample-tags")
    samplelabels.remove(args.ref_tag)
    samplelabels.insert(0, args.ref_tag)
    mvf.sample_ids = samplelabels[:]
    mvf.sample_indices = list(range(len(mvf.sample_ids)))
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label, 'index': i}
    mvf.reset_max_sample()
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    mvf.metadata.notes.append(args.command_string)
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    args.qprint("MAF Headers Written")
    mvfentries = []
    nentry = 0
    total_entries = 0
    args.qprint("Begin data conversion")
    for pos, length, msa in maf:
        for sname in samplelabels:
            if sname not in msa:
                msa[sname] = '-'*length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if mvf_alleles:
                mvfentries.append(
                    (contig_translate.get(msa['contig']),
                     pos+i, (mvf_alleles,)))
                nentry += 1
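                # Flush buffered entries to the output whenever the buffer
                # reaches args.line_buffer entries.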
                if nentry == args.line_buffer:
                    total_entries += nentry
                    mvf.write_entries(mvfentries, encoded=True)
                    args.qprint("{} entries written".format(total_entries))
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        total_entries += nentry
        mvf.write_entries(mvfentries, encoded=True)
        args.qprint("{} entries written".format(total_entries))
    args.qprint("Complete.")
    return ''
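A minimal sketch of how the function above might be driven outside the MVFtools command-line wrapper, using a plain argparse Namespace. The attribute names mirror those read inside maf2mvf; the maf attribute is only a guess at how MultiAlignFile(args) locates its input, the tag values and file names are placeholders, and print stands in for MVFtools' verbosity-aware qprint.

from argparse import Namespace

# Hypothetical driver; assumes maf2mvf and its library dependencies
# (MultiAlignFile, MultiVariantFile, encode_mvfstring) are importable.
args = Namespace(
    maf="alignment.maf",            # assumed: how MultiAlignFile(args) finds its input
    out="alignment.mvf",            # output MVF path
    overwrite=True,
    sample_tags="hg18:human,panTro2:chimp",  # placeholder tags; label is the text before ':'
    ref_tag="hg18",                 # must match one of the parsed labels
    line_buffer=100000,             # entries held in memory before each write
    command_string="maf2mvf example run",
    qprint=print,                   # stand-in for MVFtools' verbosity-aware printer
)
maf2mvf(args)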
Example No. 2
def merge_mvf(args):
    """Main method"""
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.copy_header(first_mvf)
    # Open each MVF file, read headers to make unified header
    transformers = []
    mvfmetadata = []
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname,
                               'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig()
        mvfmetadata.append(mvf.metadata)
        for i, sid in enumerate(mvf.get_sample_ids()):
            if sid not in concatmvf.get_sample_ids():
                new_sindex = concatmvf.max_sample_index
                concatmvf.max_sample_index += 1
                concatmvf.sample_indices.append(new_sindex)
                concatmvf.sample_ids.append(sid)
                concatmvf.sample_data[new_sindex] = {'id': sid}
                concatmvf.sample_id_to_index[sid] = new_sindex
            transformer.set_label(i, concatmvf.sample_id_to_index[sid])
        for cindex in mvf.contig_indices:
            if (mvf.contig_data[cindex]['label']
                    not in concatmvf.contig_label_to_index):
                new_cindex = (mvf.contig_data[cindex]['id']
                              if mvf.contig_data[cindex]['id']
                              not in concatmvf.contig_ids else
                              concatmvf.get_next_contig_index())
                concatmvf.contig_data[new_cindex] = (
                    mvf.contig_data[cindex].copy())
            else:
                new_cindex = concatmvf.contig_label_to_index[
                    mvf.contig_data[cindex]['label']]
            transformer.set_contig(cindex, new_cindex)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_max_sample()
    concatmvf.notes.append(args.command_string)
    concatmvf.write_data(concatmvf.get_header())
    # Loop through each output contig, merging entries from every input file
    blank_entry = '-' * len(concatmvf.sample_indices)
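    # Merge one output contig at a time: gather per-position allele strings
    # from every input file, then write them out in sorted position order.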
    for cons_contig in concatmvf.contig_indices:
        contig_merged_entries = {}
        args.qprint("Merging Contig Index: {}".format(cons_contig))
        for ifile, mvffile in enumerate(inputfiles):
            if cons_contig not in transformers[ifile].contigs:
                continue
            localcontig = transformers[ifile].contigs[cons_contig]
            if 'idx' not in mvffile.contig_data[localcontig]:
                print("Contig {} has no index entry in {}; "
                      "skipping".format(localcontig, args.mvf[ifile]))
                continue
            for _, pos, allelesets in mvffile.itercontigentries(localcontig,
                                                                decode=True):
                if pos not in contig_merged_entries:
                    contig_merged_entries[pos] = blank_entry[:]
                for j, base in enumerate(allelesets[0]):
                    xcoord = transformers[ifile].labels_rev[j]
                    if contig_merged_entries[pos][xcoord] != '-':
                        if contig_merged_entries[pos][xcoord] == base:
                            continue
                        if base in '-X':
                            continue
                        raise RuntimeError(
                            ("Merging columns have two different bases: "
                             "{} {} {}").format(
                                 pos, contig_merged_entries[pos][xcoord],
                                 base))
                    contig_merged_entries[pos] = (
                        contig_merged_entries[pos][:xcoord] + base +
                        contig_merged_entries[pos][xcoord + 1:])
        if contig_merged_entries:
            concatmvf.write_entries(
                ((cons_contig, coord, (entry, ))
                 for coord, entry in sorted(contig_merged_entries.items())),
                encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            cons_contig, len(contig_merged_entries)))
    return ''
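A comparable sketch for merge_mvf, again assuming a plain argparse Namespace in place of the MVFtools argument wrapper; the input and output file names are placeholders and print again stands in for qprint.

from argparse import Namespace

# Hypothetical driver; assumes merge_mvf and its library dependencies
# (MultiVariantFile, MvfTransformer) are importable.
args = Namespace(
    mvf=["sample1.mvf", "sample2.mvf"],  # uncompressed inputs merge much faster (see warning above)
    out="merged.mvf",
    overwrite=True,
    main_header_file=None,   # falsy, so the first input supplies the output header
    skip_index=False,        # build contig indices while reading each file
    command_string="merge_mvf example run",
    qprint=print,            # stand-in for MVFtools' verbosity-aware printer
)
merge_mvf(args)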