Exemplo n.º 1
0
def maf2mvf(args):
    """Main method"""
    # ESTABLISH MAF
    args.qprint("Starting ConvertMAF2MVF")
    maf = MultiAlignFile(args)
    args.qprint("MAF Established")
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    args.qprint("MVF output initialized")
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
    samplelabels = [s.split(':')[0] for s in args.sample_tags.split(',')]
    args.qprint("Sample tags processed: {}".format(samplelabels))
    if args.ref_tag not in samplelabels:
        raise IndexError("--ref-tag not in the tags listed in --sample-tags")
    samplelabels.remove(args.ref_tag)
    samplelabels.insert(0, args.ref_tag)
    mvf.sample_ids = samplelabels[:]
    mvf.sample_indices = list(range(len(mvf.sample_ids)))
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label, 'index': i}
    mvf.reset_max_sample()
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    mvf.metadata.notes.append(args.command_string)
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    args.qprint("MAF Headers Written")
    mvfentries = []
    nentry = 0
    total_entries = 0
    args.qprint("Begin data conversion")
    for pos, length, msa in maf:
        for sname in samplelabels:
            if sname not in msa:
                msa[sname] = '-'*length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if mvf_alleles:
                mvfentries.append(
                    (contig_translate.get(msa['contig']),
                     pos+i, (mvf_alleles,)))
                nentry += 1
                if nentry == args.line_buffer:
                    total_entries += nentry
                    mvf.write_entries(mvfentries, encoded=True)
                    args.qprint("{} entries written".format(total_entries))
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        total_entries += nentry
        mvf.write_entries(mvfentries)
        args.qprint("{} entries written".format(total_entries))
    args.qprint("Complete.")
    return ''
Exemplo n.º 2
0
def legacy_annotate_mvf(args):
    """Main method"""
    args.qprint("Running LegacyAnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.flavor))
    gff, geneids = parse_gff_legacy_annotate(
        args.gff, mvf.contig_data, gene_pattern=args.gene_pattern)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.flavor)
    outmvf.copy_headers_from(mvf)
    if args.nongenic_mode is False:
        outmvf.contig_data = geneids.copy()
        outmvf.contig_indices = list(range(len(geneids)))
        outmvf.contig_ids = [geneids[x]['id'] for x in
                             outmvf.contig_indices]
        outmvf.contig_labels = [geneids[x]['label'] for x in
                                outmvf.contig_indices]
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    nentry = 0
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = None
        if contigid in gff:
            for (xgeneid, xstart, xstop) in gff[contigid]:
                if xstart < pos < xstop:
                    annotated_pos = xgeneid + 0
                    break
                if args.nongenic_mode is True and args.unmargin > 0:
                    for xpos in range(pos - args.unmargin,
                                      pos + args.unmargin + 1):
                        if xstart < xpos < xstop:
                            annotated_pos = xgeneid + 0
                            break
        if annotated_pos is not None and not args.nongenic_mode:
            entrybuffer.append((annotated_pos, pos, allelesets))
        elif args.nongenic_mode and annotated_pos is None:
            entrybuffer.append((contigid, pos, allelesets))
        if args.nongenic_mode or annotated_pos is not None:
            nentry += 1
            if nentry == args.line_buffer:
                args.qprint("Writing block of entries.")
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        args.qprint("Writing final block of entries.")
        entrybuffer = []
        nentry = 0
    return ''
Exemplo n.º 3
0
def annotate_mvf(args):
    """Main method"""
    args.qprint("Running AnnotateMVF")
    mvf = MultiVariantFile(args.mvf, 'read')
    args.qprint("Input MVF header processed.")
    args.qprint("MVF flavor: {}".format(mvf.metadata['flavor']))
    gff, geneids = parse_gff_annotate(args.gff, mvf.metadata['contigs'],
                                      gene_prefix=args.gene_prefix)
    args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite,
                              flavor=mvf.metadata['flavor'])
    outmvf.metadata = deepcopy(mvf.metadata)
    if args.nongenic_mode is False:
        outmvf.metadata['contigs'] = geneids
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    entrybuffer = []
    nentry = 0
    args.qprint("Processing MVF entries.")
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = False
        if contigid in gff:
            if pos in gff[contigid]:
                annotated_pos = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                for xpos in range(pos - args.unmargin,
                                  pos + args.unmargin + 1):
                    if xpos in gff[contigid]:
                        annotated_pos = True
                        break
        if annotated_pos and not args.nongenic_mode:
            entrybuffer.append((gff[contigid][pos], pos, allelesets))
        elif args.nongenic_mode and not annotated_pos:
            entrybuffer.append((contigid, pos, allelesets))
        if args.nongenic_mode or annotated_pos:
            nentry += 1
            if nentry == args.line_buffer:
                args.qprint("Writing block of entries.")
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        args.qprint("Writing final block of entries.")
        entrybuffer = []
        nentry = 0
    return ''
Exemplo n.º 4
0
def annotate_mvf(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    gff, geneids = parse_gff_annotate(args.gff, mvf.metadata['contigs'])
    if args.quiet is False:
        print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    if args.nongenic_mode is False:
        outmvf.metadata['contigs'] = geneids
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []
    nentry = 0
    for contigid, pos, allelesets in mvf.iterentries(decode=False):
        annotated_pos = False
        if contigid in gff:
            if pos in gff[contigid]:
                annotated_pos = True
            elif args.nongenic_mode is True and args.unmargin > 0:
                for xpos in range(pos - args.unmargin,
                                  pos + args.unmargin + 1):
                    if xpos in gff[contigid]:
                        annotated_pos = True
                        break
        if args.nongenic_mode is False and annotated_pos is True:
            entrybuffer.append((gff[contigid][pos], pos, allelesets))
            nentry += 1
            if nentry == args.line_buffer:
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
        elif args.nongenic_mode is True and annotated_pos is False:
            entrybuffer.append((contigid, pos, allelesets))
            nentry += 1
            if nentry == args.line_buffer:
                outmvf.write_entries(entrybuffer)
                entrybuffer = []
                nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
Exemplo n.º 5
0
def maf2mvf(args):
    """Main method"""
    # ESTABLISH MAF
    maf = MultiAlignFile(args)
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
    samplelabels = [s.split(':')[0] for s in args.sample_tags]
    samplelabels.remove(args.ref_tag)
    samplelabels.insert(0, args.ref_tag)
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for pos, length, msa in maf:
        for sname in samplelabels:
            if sname not in msa:
                msa[sname] = '-'*length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if mvf_alleles:
                mvfentries.append(
                    (contig_translate.get(msa['contig']),
                     pos+i, (mvf_alleles,)))
                nentry += 1
                if nentry == args.line_buffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries)
    return ''
Exemplo n.º 6
0
def fasta2mvf(args):
    """Main method"""
    sepchars = dict([("PIPE", "\\|"), ("TAB", "\\t"), ("SPACE", "\\s"),
                     ("DBLSPACE", "\\s\\s"), ("COMMA", "\\,"), ("NONE", None),
                     ("AT", "\\@"), ('UNDER', "\\_"), ("DBLUNDER", "\\_\\_")])
    if args.field_sep is None:
        args.field_sep = ''
    else:
        args.field_sep = re.compile("[{}]".format(''.join(
            [sepchars[x] for x in args.field_sep])))
    if args.manual_coord:
        assert len(args.manual_coord) == len(args.fasta)
        args.manual_coord = [(x.split(':')[0],
                              int(x.split(":")[1].split('..')[0]),
                              int(x.split(':')[1].split('..')[1]))
                             for x in args.manual_coord]
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    fasta = {}
    current_contig = 0
    fsamples = []
    fcontigs = []
    for ifasta, fastapath in enumerate(args.fasta):
        print("Processing {}".format(fastapath))
        for header, seq in fasta_iter(fastapath):
            if args.field_sep is None:
                header = header[:]
            if args.field_sep != '' and args.field_sep is not None:
                header = [str(x) for x in re.split(args.field_sep, header)]
            if args.contig_by_file is True:
                contig = os.path.basename(fastapath[:])
                if args.sample_field is None:
                    sample = header[:]
                else:
                    sample = header[args.sample_field]
            elif (len(header) < max(
                    args.contig_field if args.contig_field is not None else 0,
                    args.sample_field if args.sample_field is not None else 0)
                  or args.contig_field is None or args.sample_field is None):
                contig = "UNK{}".format(current_contig)
                sample = header[:]
            elif args.manual_coord:
                contig = args.manual_coord[ifasta][0]
            else:
                contig = header[args.contig_field]
                sample = header[args.sample_field]
            if contig not in fcontigs:
                fcontigs.append(contig)
                fasta[contig] = {}
            if sample not in fsamples:
                fsamples.append(sample)
            fasta[contig][sample] = (len(seq), seq)
    reflabel = None
    if args.ref_label:
        for i, samplename in enumerate(fsamples):
            if args.ref_label in samplename:
                reflabel = i
                break
    if reflabel:
        newref = fsamples.pop(i)
        fsamples = [newref] + fsamples
    for i, contig in enumerate(fcontigs):
        new_index = mvf.get_next_contig_index()
        mvf.contig_indices.append(new_index)
        mvf.contig_ids.append(str(new_index))
        mvf.contig_labels.append(contig)
        mvf.contig_label_to_index[contig] = new_index
        mvf.contig_id_to_index[str(new_index)] = new_index
        mvf.contig_data[new_index] = {
            'label': contig,
            'id': str(new_index),
            'length': max([fasta[contig][x][0] for x in fasta[contig]])
        }
    mvf.metadata['labels'] = fsamples[:]
    for i, label in enumerate(fsamples[:]):
        mvf.sample_indices.append(i)
        mvf.sample_id_to_index[label] = i
        mvf.sample_ids.append(label)
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = 'fasta'
    mvf.metadata.append(args.command_string)
    mvf.flavor = args.flavor
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    mvf_alleles = {}
    for cind, contig in enumerate(fcontigs):
        for pos in range(mvf.contig_data[cind + 1]['length']):
            mvf_alleles = encode_mvfstring(
                ''.join(samp not in fasta[contig] and '-'
                        or pos >= fasta[contig][samp][0] and '-'
                        or fasta[contig][samp][1][pos] for samp in fsamples))
            if mvf_alleles:
                if args.flavor == 'dna':
                    mvf_alleles = ''.join(
                        ["X" if x in 'NX' else x for x in mvf_alleles])
                mvfentries.append((cind, pos + 1, (mvf_alleles, )))
                nentry += 1
                if nentry == args.write_buffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''
Exemplo n.º 7
0
def vcf2mvf(args=None):
    """Main method for vcf2mvf"""
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", "  "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    args.qprint("Opening input VCF: {}".format(args.vcf))
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    args.qprint("Establishing output MVF: {}".format(args.out))
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    mvf.notes.append(args.command_string)
    mvf.metadata['mvfversion'] = args.versionx
    # PROCESS CONTIG INFO
    args.qprint("Processing VCF headers.")
    vcfcontigs = vcf.metadata['contigs'].copy()
    args.qprint("{} contigs found.".format(len(vcfcontigs)))
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs]
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig()
    mvf.max_contig_index -= 1
    args.qprint("Processing contigs.")
    static_contig_ids = list(mvf.get_contig_ids())
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel not in static_contig_ids:
            newindex = mvf.get_next_contig_index()
            if ((is_int(vlabel) or len(vlabel) < 3)
                    and vlabel not in static_contig_ids):
                newid = vlabel[:]
            else:
                newid = str(newindex)
            mvf.contig_indices.append(newindex)
            mvf.contig_ids.append(newid)
            mvf.contig_data[newindex] = vcfcontigs[vcid].copy()
            static_contig_ids.append(newid)
            contig_translate[vlabel] = [newindex, vlabel]
    mvf.reset_max_contig()
    new_contigs = [(x, mvf.contig_data[x]['label'])
                   for x in mvf.contig_indices]
    if args.skip_contig_label_check is False:
        args.qprint("Checking contigs for label/id overlap errors.")
        xids = [x[0] for x in new_contigs]
        xlabels = [x[1] for x in new_contigs]
        xintersect = set(xids).intersection(xlabels)
        if xintersect:
            for i, (newid, newlabel) in enumerate(new_contigs):
                if i % 100 == 0:
                    args.qprint("{} contigs processed".format(i))
                if newid in xlabels[:i] or newid in xlabels[i + 1:]:
                    # if newid in xlabels:
                    # if xlabels.index(newid) != i:
                    raise RuntimeError("Error contig id {} is the same as"
                                       " the label for another contig"
                                       " ({})".format(newid,
                                                      xlabels.index(newid)))
                if newlabel in xids[:i] or newlabel in xids[i + 1:]:
                    # if newlabel in xids:
                    # if xids.index(newlabel) != i:
                    raise RuntimeError("Error contig label {} is the same"
                                       "as the id for another contig"
                                       "({})".format(newlabel,
                                                     xids.index(newlabel)))
    # PROCESS SAMPLE INFO
    args.qprint("Processing samples.")
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        newsample = [
            x.split(':') if ':' in tuple(x) else tuple([x, x])
            for x in args.sample_replace
        ]
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.sample_indices = list(range(len(samplelabels)))
    mvf.sample_ids = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.sample_data[i] = {'id': label}
    mvf.metadata['ncol'] = len(mvf.sample_ids)
    mvf.max_sample_index = len(mvf.sample_ids)
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    args.qprint("Processing VCF entries.")
    for vcfrecord in vcf.iterentries(args):
        mvfstring = ''.join(vcfrecord['genotypes'])
        if args.filter_nonref_empty is True:
            if all(x in 'Xx-?' for x in mvfstring[1:]):
                continue
        mvf_alleles = encode_mvfstring(mvfstring)
        if args.out_flavor in ('dnaqual', ):
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles,
                   qual_alleles) if args.out_flavor in ('dnaqual', ) else
                  (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''
Exemplo n.º 8
0
def filter_mvf(args):
    """Main method"""
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    # Create Actionset
    if args.labels:
        labels = mvf.get_sample_labels()[:]
        for i in range(len(args.actions)):
            action = args.actions[i]
            arr = action.split(':')
            if arr[0] in ('columns', 'collapsepriority', 'collapsemerge',
                          'allelegroup', 'notmultigroup'):
                for j in range(1, len(arr)):
                    arr[j] = ','.join(
                        [str(labels.index(x)) for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    actionset = build_actionset(args.actions, ncol)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                else:
                    sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                else:
                    sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                loc = loc.split(':')
                loc[1] = int(loc[1])
                if actionfunc(loc) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                else:
                    sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    # reprocess header if actions are used that filter columns
    if any(x == y[0] for x in ('columns', 'collapsepriority', 'collapsemerge')
           for y in actionset):
        if args.labels:
            labels = outmvf.metadata['labels'][:]
        else:
            labels = [x for x in outmvf.metadata['samples']]
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg[0]]
            elif actionname in ('collapsepriority', 'collapsemerge'):
                labels = [
                    labels[x] for x in range(len(labels))
                    if x not in actionarg[0][1:]
                ]
        if args.labels:
            oldindices = mvf.get_sample_indices(labels)
        else:
            oldindices = labels[:]
        newsamples = {}
        for i, _ in enumerate(labels):
            newsamples[i] = mvf.metadata['samples'][oldindices[i]]
        outmvf.metadata['samples'] = newsamples.copy()
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    # End header editing
    linebuffer = []
    nbuffer = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for actionname, actiontype, actionfunc, actionargs in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if actionfunc([chrom, pos]) is False:
                    linefail = True
            if linefail:
                break
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = mvf.encode(alleles)
                if not alleles:
                    linefail = True
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles, )))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.line_buffer:
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    return ''
Exemplo n.º 9
0
def filter_mvf(args):
    """Main method"""
    args.qprint("Running FilterMVF")
    if args.more_help is True:
        modulehelp()
        sys.exit()
    if args.mvf is None and args.test is None:
        raise RuntimeError("No input file specified with --mvf")
    if args.out is None and args.test is None:
        raise RuntimeError("No output file specified with --out")
    # Establish Input MVF
    if args.test is not None:
        ncol = args.test_nchar or len(args.test.split()[1])
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    args.qprint("Input MVF read with {} columns.".format(ncol))
    # Create Actionset
    if args.labels:
        for i in range(len(args.actions)):
            action = args.actions[i]
            arr = action.split(':')
            if arr[0] in ('collapsepriority', 'collapsemerge'):
                arr[1] = ','.join([
                    str(mvf.sample_id_to_index[x])
                    for x in arr[1].split(',')])
            if arr[0] in ('columns', 'allelegroup', 
                          'notmultigroup', 'reqsample'):
                for j in range(1, len(arr)):
                    arr[j] = ','.join([
                        str(mvf.sample_id_to_index[x])
                        for x in arr[j].split(',')])
            args.actions[i] = ':'.join(arr)
    removed_columns = set([])
    for i in range(len(args.actions)):
        action = args.actions[i]
        arr = action.split(':')
        if arr[0] in ('collapsepriority', 'collapsemerge'):
            tmp_arr = arr[1][:]
            arr[1] = ','.join([
                str(int(x) - len([y for y in removed_columns if y < int(x)]))
                for x in arr[1].split(',')])
            removed_columns.update([int(x) for x in tmp_arr.split(',')[1:]])
            print(arr)
            print(removed_columns)
        if arr[0] in ('columns', 'allelegroup', 
                      'notmultigroup', 'reqsample'):
            for j in range(1, len(arr)):
                arr[j] = ','.join([
                    str(int(x) - len([y for y in removed_columns if y < int(x)]))
                    for x in arr[j].split(',')])
        args.actions[i] = ':'.join(arr)
            
            
    actionset = build_actionset(args.actions, ncol)
    args.qprint("Actions established.")
    args.qprint(actionset)
    # TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                loc = loc.split(':')
                loc[1] = int(loc[1])
                if actionfunc(loc) is False:
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    # MAIN MODE
    # Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)

    removed_indices = set([])
    # reprocess header if actions are used that filter columns
    if any(x == y[0] for x in ('columns', 'collapsepriority', 'collapsemerge')
           for y in actionset):
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                if args.labels:
                    oldindices = [outmvf.sample_id_to_index[int(x)]
                                  for x in actionarg[0]]
                else:
                    oldindices = [int(x) for x in actionarg[0]]
            elif actionname in ('collapsepriority', 'collapsemerge'):
                actionarg[0] = [x - len([y for y in removed_indices if y < x])
                                 for x in actionarg[0]]
                oldindices = [x for x in outmvf.sample_indices
                              if x not in actionarg[0][1:]]
            outmvf.sample_ids = outmvf.get_sample_ids(oldindices)
            outmvf.sample_data = dict(
                (i, outmvf.sample_data[oldindices[i]])
                for i, _ in enumerate(oldindices))

            if actionname in ('collapsepriority', 'collapsemerge'):
                if len(actionarg) == 2:
                    outmvf.sample_data[actionarg[0][0]]['id'] = actionarg[1][0]
                    outmvf.sample_ids[actionarg[0][0]] = actionarg[1][0]
            outmvf.sample_indices = list(range(len(oldindices)))
    outmvf.metadata['ncol'] = len(outmvf.sample_indices)
    outmvf.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF established.")
    # End header editing
    linebuffer = []
    nbuffer = 0
    args.qprint("Processing Entries.")
    write_total = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose is True:
            sys.stdout.write(" {} {} ".format(alleles, linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                linefail = not actionfunc(alleles, linetype)
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                linefail = linetype == 'empty'
            elif actiontype == 'location':
                linefail = not actionfunc([chrom, pos])
            if linefail:
                break
        if linefail is False:
            if transformed:
                if linetype == 'full':
                    alleles = mvf.encode(alleles)
                if not alleles:
                    linefail = True
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles,)))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.line_buffer:
                write_total += args.line_buffer
                args.qprint("{} entries written. Total written: {}.".format(
                    args.line_buffer, write_total))
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        write_total += len(linebuffer)
        args.qprint("{} entries written. Total written: {}.".format(
            args.line_buffer, write_total))
        linebuffer = []
    return ''
Exemplo n.º 10
0
def mvf_join(args):
    """Main method"""
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        else:
            args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label
                    }
            if concatmvf.metadata['labels'].index(label) != i:
                transformer.set_label(
                    i, concatmvf.metadata['labels'].index(label))
        for contigid, contigdata in iter(mvf.metadata['contigs'].items()):
            if contigdata['label'] not in [
                    concatmvf.metadata['contigs'][x]['label']
                    for x in concatmvf.metadata['contigs']
            ]:
                newid = (contigid not in concatmvf.metadata['contigs']
                         and contigid or concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
            else:
                for concatid, concatdata in (
                        concatmvf.metadata['contigs'].items()):
                    if contigdata['label'] == concatdata['label']:
                        newid = concatid
                        break
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    # Write output header
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    entries = []
    nentries = 0
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(decode=False,
                                                         quiet=args.quiet):
            if transformer.labels:
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        x in transformer.labels
                        and alleles[transformer.labels[x]] or alleles[x]
                        for x in range(len(alleles))
                    ]))
            if transformer.contigs:
                contigid = (contigid in transformer['contigs']
                            and transformer['contigs'][contigid] or contigid)
            entries.append((contigid, pos, allelesets))
            nentries += 1
            if nentries == args.line_buffer:
                concatmvf.write_entries(entries)
                entries = []
                nentries = 0
        if entries:
            concatmvf.write_entries(entries)
            entries = []
            nentries = 0
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''
Exemplo n.º 11
0
def translate_mvf(args):
    """Main method"""
    args.qprint("Running TranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff_genes, gene_order = parse_gff_exome(args)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.contig_data = dict(
         (
                i, dict((y, z)
                                       for (y, z) in gff_genes[x].items()
                                       if y not in ('cds', )))
                              for (i, x) in enumerate(gene_order))
    outmvf.contig_indices = list(range(len(gene_order)))
    outmvf.contig_ids = [gff_genes[x]['id']
                         for x in gene_order]
    outmvf.contig_labels = [gff_genes[x]['label']
                            for x in gene_order]
    outmvf.flavor = args.output_data
    outmvf.metadata.notes.append(args.command_string)
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        running_gene_index = -1
        for igene, gene in enumerate(gene_order):
            xcontiglabel = gff_genes[gene]['contig']
            xcontig = mvf.get_contig_indices(
                labels=gff_genes[gene]['contig'])
            if xcontig is None:
                print("Warning: contig {} not found".format(
                    gff_genes[gene]['contig']))
            xcontigid = mvf.get_contig_ids(indices=xcontig)[0]
            min_gene_coord = gff_genes[gene]['cds'][0][0]
            max_gene_coord = gff_genes[gene]['cds'][-1][1]
            mvf_entries = {}
            if not igene % 100:
                args.qprint("Processing gene {} on {}".format(
                    gene, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                if pos < min_gene_coord:
                    continue
                if pos > max_gene_coord:
                    break
                mvf_entries[pos] = allelesets[0]
            reverse_strand = gff_genes[gene]['strand'] == '-'
            coords = []
            running_gene_index += 1
            for elem in gff_genes[gene]['cds']:
                coords.extend(list(range(elem[0], elem[1] + 1)))
            if reverse_strand:
                coords = coords[::-1]
            for codoncoord in range(0, len(coords), 3):
                alleles = tuple(mvf_entries.get(x, '-')
                                for x in coords[codoncoord:codoncoord + 3])
                if len(alleles) < 3:
                    alleles = tuple(list(alleles) + ['-'] * (3 - len(alleles)))
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    amino_acids = outmvf.encode(''.join(amino_acids))
                if args.output_data == 'protein':
                    entrybuffer.append((
                        (
                            xcontigid
                            if args.retain_contigs
                            else running_gene_index
                        ),
                        (
                            coords[codoncoord]
                            if args.retain_coords
                            else codoncoord
                        ),
                        (
                            amino_acids,
                        )
                    ))
                elif args.output_data == 'codon':
                    entrybuffer.append((
                        (
                            xcontigid
                            if args.retain_contigs
                            else running_gene_index
                        ),
                        (
                            coords[codoncoord]
                            if args.retain_coords
                            else codoncoord
                        ),
                        (
                            amino_acids,
                            alleles[0],
                            alleles[1],
                            alleles[2]
                        )
                    ))
                elif args.output_data == 'dna':
                    for j, elem in enumerate(
                            range(codoncoord,
                                  min(codoncoord + 3, len(coords)))):
                        entrybuffer.append((
                            (
                                xcontigid
                                if args.retain_contigs
                                else running_gene_index
                            ),
                            (
                                coords[elem]
                                if args.retain_coords
                                else elem + 1
                            ),
                            (
                                alleles[j],
                            )
                        ))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
        if entrybuffer:
            outmvf.write_entries(entrybuffer)
            entrybuffer = []
            nentry = 0
    return ''
Exemplo n.º 12
0
def legacy_translate_mvf(args):
    """Main method"""
    args.qprint("Running LegacyTranslateMVF")
    if args.gff:
        args.qprint("Reading and Indexing MVF.")
    else:
        args.qprint("Reading MVF.")
    mvf = MultiVariantFile(args.mvf, 'read', contigindex=bool(args.gff))
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        args.qprint("Processing MVF Index File.")
        mvf.read_index_file()
        args.qprint("GFF processing start.")
        gff = parse_gff_legacy_translate(
            args.gff, args,
            parent_gene_pattern=args.parent_gene_pattern)
        args.qprint("GFF processed.")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.copy_headers_from(mvf)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    args.qprint("Output MVF Established.")
    entrybuffer = []
    nentry = 0
    pos = None
    if not args.gff:
        args.qprint("No GFF used, translating sequences as pre-aligned in "
                    "coding frame.")
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(
                        inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids,)))
                    else:
                        entrybuffer.append((
                            current_contig, pos, (
                                amino_acids, alleles[0],
                                alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(
                    inputbuffer, outmvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids,)))
                else:
                    entrybuffer.append((
                        current_contig, pos, (
                            amino_acids, alleles[0],
                            alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        args.qprint("Indexing GFF gene names.")
        # mvfid_to_gffname = outmvf.get_contig_reverse_dict()
        for xcontig in outmvf.get_contig_indices():
            mvf_entries = {}
            xcontiglabel = outmvf.get_contig_labels(indices=xcontig)[0]
            xcontigid = outmvf.get_contig_ids(indices=xcontig)[0]
            if xcontiglabel not in gff:
                if args.verbose:
                    print(
                        ("No entries in GFF, "
                         "skipping contig: index:{} id:{} label:{}").format(
                             xcontig, xcontigid, xcontiglabel))
                continue
            if not xcontig % 100:
                args.qprint("Processing contig: {} {}".format(
                    xcontigid, xcontiglabel))
            for contigid, pos, allelesets in mvf.itercontigentries(
                    xcontig, decode=False):
                mvf_entries[pos] = allelesets[0]
            for coords in sorted(gff[xcontiglabel]):
                reverse_strand = coords[3] == '-'
                alleles = (tuple(mvf_entries.get(x, '-')
                                 for x in coords[2::-1])
                           if reverse_strand is True
                           else tuple(mvf_entries.get(x, '-')
                                      for x in coords[0:3]))
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = tuple(
                            MLIB.complement_bases[x] for x in alleles)
                    decoded_alleles = alleles
                    amino_acids = translate_single_codon(''.join(alleles))
                else:
                    if reverse_strand is True:
                        decoded_alleles = tuple(tuple(MLIB.complement_bases[y]
                                                      for y in mvf.decode(x))
                                                for x in alleles)
                        alleles = tuple(outmvf.encode(''.join(x))
                                        for x in decoded_alleles)
                    else:
                        decoded_alleles = tuple(mvf.decode(x) for x in alleles)
                    amino_acids = tuple(translate_single_codon(''.join(x))
                                        for x in zip(*decoded_alleles))
                    # print("aminx", amino_acids)
                    amino_acids = outmvf.encode(''.join(amino_acids))
                # if all(x in '-X' for x in amino_acids):
                #    continue
                # print("amino", amino_acids)
                # print("translated", amino_acids, alleles)
                if args.output_data == 'protein':
                    entrybuffer.append((xcontig, coords[0], (amino_acids,)))
                else:
                    entrybuffer.append((
                        xcontigid, coords[0], (
                            amino_acids, alleles[0], alleles[1], alleles[2])))
                nentry += 1
                if nentry >= args.line_buffer:
                    args.qprint("Writing a block of {} entries.".format(
                        args.line_buffer))
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
Exemplo n.º 13
0
def merge_mvf(args):
    """Main method"""
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.copy_header(first_mvf)
    # Open each MVF file, read headers to make unified header
    transformers = []
    mvfmetadata = []
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname,
                               'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig()
        mvfmetadata.append(mvf.metadata)
        for i, sid in enumerate(mvf.get_sample_ids()):
            if sid not in concatmvf.get_sample_ids():
                new_sindex = concatmvf.max_sample_index + 0
                concatmvf.max_sample_index += 1
                concatmvf.sample_indices.append(new_sindex)
                concatmvf.sample_ids.append(sid)
                concatmvf.sample_data[new_sindex] = {}
                concatmvf.sample_data[new_sindex]['id'] = sid
                concatmvf.sample_id_to_index[sid] = new_sindex
            transformer.set_label(i, concatmvf.sample_id_to_index[sid])
        for cindex in mvf.contig_indices:
            if (mvf.contig_data[cindex]['label']
                    not in concatmvf.contig_label_to_index):
                new_cindex = (mvf.contig_data[cindex]['id']
                              if mvf.contig_data[cindex]['id']
                              not in concatmvf.contig_ids else
                              concatmvf.get_next_contig_index())
                concatmvf.contig_data[new_cindex] = (
                    mvf.contig_data[cindex].copy())
            else:
                new_cindex = concatmvf.contig_label_to_index[
                    mvf.contig_data[cindex]['label']]
            transformer.set_contig(cindex, new_cindex)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_max_sample()
    concatmvf.notes.append(args.command_string)
    concatmvf.write_data(concatmvf.get_header())
    # Now loop through each file
    blank_entry = '-' * len(concatmvf.sample_indices)
    for cons_contig in concatmvf.contig_indices:
        contig_merged_entries = {}
        args.qprint("Merging Contig Index: {}".format(cons_contig))
        for ifile, mvffile in enumerate(inputfiles):
            if cons_contig not in transformers[ifile].contigs:
                continue
            localcontig = transformers[ifile].contigs[cons_contig]
            if 'idx' not in mvffile.contig_data[localcontig]:
                print("not found")
                continue
            for _, pos, allelesets in mvffile.itercontigentries(localcontig,
                                                                decode=True):
                if pos not in contig_merged_entries:
                    contig_merged_entries[pos] = blank_entry[:]
                for j, base in enumerate(allelesets[0]):
                    xcoord = transformers[ifile].labels_rev[j]
                    if contig_merged_entries[pos][xcoord] != '-':
                        if contig_merged_entries[pos][xcoord] == base:
                            continue
                        if base in '-X':
                            continue
                        raise RuntimeError(
                            ("Merging columns have two different bases: "
                             "{} {} {}").format(
                                 pos, contig_merged_entries[pos][xcoord],
                                 base))
                    contig_merged_entries[pos] = (
                        contig_merged_entries[pos][:xcoord] + base +
                        contig_merged_entries[pos][xcoord + 1:])
        if contig_merged_entries:
            concatmvf.write_entries(
                ((cons_contig, coord, (entry, ))
                 for coord, entry in sorted(contig_merged_entries.items())),
                encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            cons_contig, len(contig_merged_entries)))
    return ''
Exemplo n.º 14
0
def merge_mvf(args):
    """Main method"""
    args.qprint("Running MergeMVF")
    if any(fpath.endswith('.gz') for fpath in args.mvf):
        print("WARNING! Running MergeMVF with gzipped input files is "
              "extremely slow and strongly discouraged.")
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # Copy the first file's metadata
    args.qprint("Reading First File and Establishing Output")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        else:
            args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    # Open each MVF file, read headers to make unified header
    transformers = []
    mvfmetadata = []
    concatmvf_reverse_contig = dict(
        (x['label'], k) for (k, x) in concatmvf.metadata['contigs'].items())
    inputfiles = []
    for mvfname in args.mvf:
        args.qprint("Reading headers from {}".format(mvfname))
        # This will create a dictionary of samples{old:new}, contigs{old:new}
        args.qprint("Processing Headers and Indexing: {}".format(mvfname))
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname,
                               'read',
                               contigindex=(not args.skip_index))
        if args.skip_index:
            mvf.read_index_file()
        mvf.reset_max_contig_id()
        mvfmetadata.append(mvf.metadata)
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label
                    }
#            if concatmvf.metadata['labels'].index(label) != i:
            transformer.set_label(i, concatmvf.metadata['labels'].index(label))
        for contigid, contigdata in iter(mvf.metadata['contigs'].items()):
            if contigdata['label'] not in concatmvf_reverse_contig:
                newid = (contigid
                         if contigid not in concatmvf.metadata['contigs'] else
                         concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
                concatmvf_reverse_contig[contigdata['label']] = newid
            else:
                newid = concatmvf_reverse_contig[contigdata['label']]
            transformer.set_contig(contigid, newid)
        transformers.append(transformer)
        inputfiles.append(mvf)
    # Write output header
    args.qprint("Writing headers to merge output")
    concatmvf.reset_ncol()
    concatmvf.write_data(concatmvf.get_header())
    contigs = concatmvf.metadata['contigs']
    # Now loop through each file
    blank_entry = '-' * len(concatmvf.metadata['samples'])
    for current_contig in contigs:
        contig_merged_entries = {}
        args.qprint("Merging Contig: {}".format(current_contig))
        for ifile, mvffile in enumerate(inputfiles):
            if current_contig not in transformers[ifile].contigs:
                continue
            localcontig = transformers[ifile].contigs[current_contig]
            for chrom, pos, allelesets in mvffile.itercontigentries(
                    localcontig, decode=True):
                if pos not in contig_merged_entries:
                    contig_merged_entries[pos] = blank_entry[:]
                for j, base in enumerate(allelesets[0]):
                    xcoord = transformers[ifile].labels_rev[j]
                    if contig_merged_entries[pos][xcoord] != '-':
                        if contig_merged_entries[pos][xcoord] == base:
                            continue
                        if base == '-' or base == 'X':
                            continue
                        raise RuntimeError(
                            "Merging columns have two different bases: {} {} {}"
                            .format(pos, contig_merged_entries[pos][xcoord],
                                    base))
                    contig_merged_entries[pos] = (
                        contig_merged_entries[pos][:xcoord] + base +
                        contig_merged_entries[pos][xcoord + 1:])
        concatmvf.write_entries(
            ((current_contig, coord, (entry, ))
             for coord, entry in sorted(contig_merged_entries.items())),
            encoded=False)
        args.qprint("Entries written for contig {}: {}".format(
            current_contig, len(contig_merged_entries)))
    return ''
Exemplo n.º 15
0
def translate_mvf(args):
    """Main method"""
    mvf = MultiVariantFile(args.mvf, 'read')
    if mvf.flavor != 'dna':
        raise RuntimeError("MVF must be flavor=dna to translate")
    if args.gff:
        gff = parse_gff_translate(args.gff, args)
        if not args.quiet:
            print("gff_processed")
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    outmvf.flavor = args.output_data
    outmvf.write_data(outmvf.get_header())
    entrybuffer = []
    nentry = 0
    if not args.gff:
        inputbuffer = []
        current_contig = ''
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if current_contig == '':
                current_contig = contigid[:]
            if contigid == current_contig:
                inputbuffer.append((pos, allelesets))
            else:
                for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                    if all([x in '-X' for x in amino_acids]):
                        continue
                    if args.output_data == 'protein':
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids, )))
                    else:
                        entrybuffer.append(
                            (current_contig, pos, (amino_acids, alleles[0],
                                                   alleles[1], alleles[2])))
                    nentry += 1
                    if nentry == args.line_buffer:
                        outmvf.write_entries(entrybuffer)
                        entrybuffer = []
                        nentry = 0
                inputbuffer = [(pos, allelesets)]
                current_contig = contigid[:]
        if inputbuffer:
            for _, amino_acids, alleles in iter_codons(inputbuffer, mvf):
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((current_contig, pos, (amino_acids, )))
                else:
                    entrybuffer.append(
                        (current_contig, pos, (amino_acids, alleles[0],
                                               alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    else:
        mvf_entries = {}
        for contigid, pos, allelesets in mvf.iterentries(decode=False):
            if contigid not in mvf_entries:
                mvf_entries[contigid] = {}
            mvf_entries[contigid][pos] = allelesets[0]
        for contigname in sorted(gff):
            contigid = mvf.get_contig_ids(labels=contigname)[0]
            for coords in sorted(gff[contigname]):
                reverse_strand = False
                if coords[3] == '-':
                    reverse_strand = True
                    alleles = [
                        mvf_entries[contigid].get(x, '-')
                        for x in coords[2::-1]
                    ]
                else:
                    alleles = [
                        mvf_entries[contigid].get(x, '-') for x in coords[0:3]
                    ]
                if all(len(x) == 1 for x in alleles):
                    if reverse_strand:
                        alleles = [MLIB.complement_bases[x] for x in alleles]
                    decoded_alleles = alleles
                    amino_acids = translate(''.join(alleles))[0]
                else:
                    if reverse_strand:
                        decoded_alleles = [[
                            MLIB.complement_bases[y] for y in mvf.decode(x)
                        ] for x in alleles]
                        alleles = [
                            mvf.encode(''.join(x)) for x in decoded_alleles
                        ]
                    else:
                        decoded_alleles = [mvf.decode(x) for x in alleles]
                    amino_acids = [
                        translate(''.join(x)) for x in zip(*decoded_alleles)
                    ]
                    amino_acids = mvf.encode(''.join(
                        [x[0] for x in amino_acids]))
                if all([x in '-X' for x in amino_acids]):
                    continue
                if args.output_data == 'protein':
                    entrybuffer.append((contigid, coords[0], (amino_acids, )))
                else:
                    entrybuffer.append(
                        (contigid, coords[0], (amino_acids, alleles[0],
                                               alleles[1], alleles[2])))
                nentry += 1
                if nentry == args.line_buffer:
                    outmvf.write_entries(entrybuffer)
                    entrybuffer = []
                    nentry = 0
    if entrybuffer:
        outmvf.write_entries(entrybuffer)
        entrybuffer = []
        nentry = 0
    return ''
Exemplo n.º 16
0
def vcf2mvf(args=None):
    """Main method for vcf2mvf"""
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", "  "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.field_sep]
    # ESTABLISH VCF
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    vcfcontigs = vcf.metadata['contigs'].copy()
    contig_translate = {}
    if args.contig_ids:
        for cid, cvcf, cmvf in (x.split(';') for x in args.contig_ids):
            try:
                cid = int(cid)
            except ValueError:
                pass
            assert cvcf in [vcfcontigs[x]['label'] for x in vcfcontigs]
            for vid in vcfcontigs:
                if vcfcontigs[vid]['label'] == cvcf:
                    contig_translate[cvcf] = [cid, cmvf]
                    if cid in mvf.metadata['contigs']:
                        raise RuntimeError(
                            'Contig id {} is not unique'.format(cid))
                    mvf.metadata['contigs'][cid] = vcfcontigs[vid].copy()
                    if cmvf in mvf.get_contig_labels():
                        raise RuntimeError(
                            'Contig label {} is not unique'.format(cmvf))
                    mvf.metadata['contigs'][cid]['label'] = cmvf[:]
    mvf.reset_max_contig_id()
    for vcid in vcfcontigs:
        vlabel = vcfcontigs[vcid]['label']
        if vlabel not in mvf.get_contig_labels():
            if ((is_int(vlabel) or len(vlabel) < 3)
                    and vlabel not in mvf.get_contig_ids()):
                newid = vlabel[:]
            else:
                newid = mvf.get_next_contig_id()
            mvf.metadata['contigs'][newid] = vcfcontigs[vcid].copy()
            contig_translate[vlabel] = [newid, vlabel]
    mvf.reset_max_contig_id()
    new_contigs = [(x, mvf.metadata['contigs'][x]['label'])
                   for x in mvf.metadata['contigs']]
    for i, (newid, newlabel) in enumerate(new_contigs):
        for j, (xid, xlabel) in enumerate(new_contigs):
            if i == j:
                continue
            if newid == xlabel:
                raise RuntimeError("Error contig id {} is the same as"
                                   " the label for another contig"
                                   " ({} {})".format(newid, xid, xlabel))
            if newlabel == xid:
                raise RuntimeError("Error contig label {} is the same"
                                   "as the id for another contig"
                                   "({} {})".format(newlabel, xid, xlabel))
    # PROCESS SAMPLE INFO
    samplelabels = [args.ref_label] + vcf.metadata['samples'][:]
    if args.alleles_from:
        args.alleles_from = args.alleles_from.split(':')
        samplelabels += args.alleles_from
    if args.sample_replace:
        newsample = [
            x.split(':') if ':' in tuple(x) else tuple([x, x])
            for x in args.sample_replace
        ]
        unmatched = [x for x in enumerate(samplelabels)]
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = vcf.metadata['sourceformat']
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for vcfrecord in vcf.iterentries(args):
        # try:
        mvf_alleles = encode_mvfstring(''.join(vcfrecord['genotypes']))
        if args.out_flavor in ('dnaqual', ):
            qual_alleles = encode_mvfstring(''.join(vcfrecord['qscores']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord['contig'])[0],
                 vcfrecord['coord'],
                 ((mvf_alleles,
                   qual_alleles) if args.out_flavor in ('dnaqual', ) else
                  (mvf_alleles, ))))
            nentry += 1
            if nentry == args.line_buffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
        # except Exception as exception:
    if mvfentries:
        mvf.write_entries(mvfentries)
        mvfentries = []
    return ''