Example #1
#!/usr/bin/env python
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath


# formatting necessity: uids within a cluster are colon-joined into a single hashable key
def getkey(uid_list):
    return ':'.join(uid_list)


def read_annotations(fname, glfo):
    annotations = {}
    with open(fname.replace('.csv', '-cluster-annotations.csv')) as csvfile:
        reader = csv.DictReader(csvfile)
        for line in reader:  # there's a line for each cluster
            if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
                continue
            utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
            utils.add_implicit_info(glfo, line)  # add stuff to <line> that's useful but isn't written to the csv, since it's redundant
            # utils.print_reco_event(line)  # print ascii-art representation of the rearrangement event
            annotations[getkey(line['unique_ids'])] = line
    return annotations


# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(line)
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
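
# A minimal usage sketch for read_annotations() above -- 'my-output.csv' is a
# hypothetical partis output path, whose companion annotation file would then be
# 'my-output-cluster-annotations.csv':
annotations = read_annotations('my-output.csv', glfo)
for uids, annotation in annotations.items():
    print '%3d seqs annotated with %s' % (len(annotation['unique_ids']), annotation['v_gene'])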
Example #3
def read_sequence_file(infname,
                       is_data,
                       n_max_queries=-1,
                       args=None,
                       simglfo=None,
                       quiet=False,
                       more_input_info=None):
    # NOTE renamed this from get_seqfile_info() since I'm changing the return values, but I don't want to update the calls everywhere (e.g. in compareutils)
    yaml_glfo = None
    suffix = utils.getsuffix(infname)
    if suffix in delimit_info:
        seqfile = open(infname)  # closes on function exit. no, this isn't the best way to do this
        reader = csv.DictReader(seqfile, delimiter=delimit_info[suffix])
    elif suffix in ['.fa', '.fasta', '.fastx']:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=n_max_queries,  # NOTE don't use istartstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if (args is not None and not args.abbreviate) else None),
        )  # NOTE also can't filter on args.queries here if we're also translating
    elif suffix == '.yaml':
        yaml_glfo, reader, _ = utils.read_yaml_output(
            infname,
            n_max_queries=n_max_queries,
            synth_single_seqs=True,
            dont_add_implicit_info=True
        )  # not really sure that long term I want to synthesize single seq lines, but for backwards compatibility it's nice a.t.m.
        if not is_data:
            simglfo = yaml_glfo  # doesn't replace the contents, of course, which is why we return it
    else:
        raise Exception('unhandled file extension %s' % suffix)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    potential_names, used_names = None, None  # for abbreviating
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        if suffix != '.yaml':
            utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid, potential_names, used_names = utils.choose_new_uid(
                    potential_names, used_names)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if any(c not in utils.alphabet for c in inseq):
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {'unique_ids': [uid], 'seqs': [inseq]}

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])
            for line_key in utils.input_metafile_keys.values():
                if line_key in reco_info[uid]:  # this is kind of weird to copy from sim info to input info, but it makes sense because affinity is really meta info (the only other place affinity could come from is --input-metafname below). Here I'm defining meta info more or less as any input info besides name and sequence (I think the distinction is only really important because we want to support fastas, which can't [shouldn't!] handle anything else)
                    input_info[uid][line_key] = copy.deepcopy(reco_info[uid][line_key])  # note that the args.input_metafname stuff below should print a warning if you've also specified that (which you shouldn't, if it's simulation)

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should affect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    if more_input_info is not None:  # if you use this on simulation, the extra queries that aren't in <reco_info> may end up breaking something down the line (but I don't imagine this really getting used on simulation)
        if len(set(more_input_info) & set(input_info)) > 0:
            print '  %s found %d queries in both --infname and --queries-to-include-fname (note that we don\'t check here that they correspond to the same sequence): %s' % (
                utils.color('red', 'note:'),
                len(set(more_input_info) & set(input_info)),
                ' '.join(set(more_input_info) & set(input_info))
            )  # not necessarily a problem, but you probably *shouldn't* have sequences floating around in two different files
        if args is not None and args.seed_unique_id is not None and args.seed_unique_id in more_input_info:
            found_seed = True
        input_info.update(more_input_info)
    if args is not None and args.input_metafname is not None:
        read_input_metafo(args.input_metafname,
                          input_info.values(),
                          debug=True)
    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info, yaml_glfo
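
# A minimal usage sketch for read_sequence_file() above: with args=None all the
# command-line-dependent filtering is skipped ('seqs.fa' is a hypothetical input file):
input_info, reco_info, yaml_glfo = read_sequence_file('seqs.fa', is_data=True)
print '  read %d sequences' % len(input_info)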


# formatting necessity
def getkey(uid_list):
    return ':'.join(uid_list)


# creates a dictionary with keys = unique_ids and values = annotations
annotations = {}
with open(args.infile.replace('.csv', '-cluster-annotations.csv')) as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:  # there's a line for each cluster
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation)
            continue
        utils.process_input_line(line)  # converts strings in the csv file to floats/ints/dicts/etc.
        utils.add_implicit_info(glfo, line)  # add stuff to <line> that's useful but isn't written to the csv, since it's redundant
        # utils.print_reco_event(line)  # print ascii-art representation of the rearrangement event
        annotations[getkey(line['unique_ids'])] = line

# sort by size
sorted_clusters = sorted(annotations,
                         key=lambda q: len(annotations[q]['unique_ids']),
                         reverse=True)

#### sorted_clusters = [c for c in sorted_clusters if utils.is_functional(annotations[c])] # checks if the cluster contains ANY non-functional sequences

# total size of repertoire (number of sequences) -- each key in <annotations> is a
# colon-joined uid string, so count the uids rather than the key's characters
n_total = sum(len(annotations[cluster]['unique_ids']) for cluster in sorted_clusters)
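
# With <sorted_clusters> in hand, a natural next step is to look at the largest
# clonal families -- a minimal sketch (the ten-cluster cutoff is arbitrary, and
# 'cdr3_length' is among the implicit keys added by utils.add_implicit_info()):
for cluster in sorted_clusters[:10]:
    annotation = annotations[cluster]
    print '%4d seqs  %s  cdr3 length %d' % (len(annotation['unique_ids']), annotation['v_gene'], annotation['cdr3_length'])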
Example #5
def get_seqfile_info(infname,
                     is_data,
                     n_max_queries=-1,
                     args=None,
                     glfo=None,
                     simglfo=None):
    """ return list of sequence info from files of several types """

    if not is_data and glfo is None:
        print '  WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(infname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            queries=(args.queries if args is not None else None),
            n_max_queries=n_max_queries)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            raise Exception(
                'unexpected character (not among %s) in input sequence with id %s:\n  %s'
                % (utils.nukes + utils.ambiguous_bases, uid, inseq))

        input_info[uid] = {'unique_ids': [uid], 'seqs': [inseq]}

        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
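
# A hedged usage sketch for get_seqfile_info() above on simulation input -- reco_info
# is only filled when is_data is False ('simu.csv' is a hypothetical file, and
# <simglfo> would be the germline set the simulation was run with):
input_info, reco_info = get_seqfile_info('simu.csv', is_data=False, simglfo=simglfo)
print '  read %d sequences (%d with simulation truth)' % (len(input_info), len(reco_info))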
Example #6
File: make_profile.py  Project: krdav/SPURF
def extract_seqs(fnam):
    '''
    Reads a partis cluster-annotations file and extracts relevant information and sequences.
    '''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        reader = csv.DictReader(fh)
        data = list(reader)

    sequences_i = list()
    info_i = list()

    if args.allele_finding:
        fnam_base = fnam.split('_partitions')[0].split('/')
        glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
    else:
        glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
    for row in data:
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            # Read default germline info
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            if 'failed annotation' not in str(e):  # substring test must be against the message string, not the exception object
                pass
                # print('First skip')
                # print(e)
            else:  # NOTE fnam_base is only set above when args.allele_finding is true
                print 'Reading from'
                print '{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1])
                print e
            continue

        # Extract the full N-padded naive sequence,
        # and find the V- and J-gene bounds on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        naiveDNA = row['naive_seq']
        # Skip naive sequences too short or with stop codons:
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # There has been a name change and this try/except is meant to provide backwards compatibility:
        try:
            lseq = row['input_seqs'][:]
        except:
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences without indels or stop codons, and above a minimum amino acid length (QC):
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- No indels
        ### stop_seq[i]  <-- No partis-annotated stops (there still seem to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- Checks whether the sequence is long enough and free of stop codons
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
        # And mutation frequencies:
        mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1]
        assert(len(mut_freqs) == len(lseq))
        # Convert frequency to counts:
        Nmuts = [int(round(float(f) * len(s.strip('N')))) for f, s in zip(mut_freqs, lseq)]

        # Deduplicate AAseqs, and deduplicate lseq according to the amino-acid-level duplications:
        lAAseq_dict = dict()
        lseq_unique = list()
        for i, aa in enumerate(lAAseq):
            if aa in lAAseq_dict:
                lAAseq_dict[aa].append(i)
            else:
                lAAseq_dict[aa] = [i]
                lseq_unique.append(repair_seq(lseq[i][:], naiveDNA[:], vj_bounds))
        assert(len(lAAseq_dict) == len(lseq_unique))
        # Make the deduplicated sequence list and the mutation rates:
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        for aa, idxs in lAAseq_dict.items():
            lAAseq_dedup.append(aa)
            Nmut_list = [float(Nmuts[i]) for i in idxs]
            Nmuts_dedup.append(int(round(sum(Nmut_list)/len(Nmut_list))))
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(lseq_unique))

        # Exclude small clonal families after all the QC and deduplication:
        if len(lAAseq_dedup) < args.MIN_OBS:
            continue

        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts_dedup[:],
                       'AAseqs': lAAseq_dedup[:], 'DNAseqs': lseq_unique[:]})
    return(sequences_i, info_i)
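
# A hedged sketch of calling extract_seqs() above ('foo_partitions.csv' is a
# hypothetical partis cluster-annotations file; args, repair_seq() etc. come from
# the surrounding make_profile.py script):
sequences, info = extract_seqs('foo_partitions.csv')
for entry in info:
    print '{}: naive AA seq {}'.format(entry['v_gene'], entry['naive_seq'])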
Example #7
File: make_profile.py  Project: krdav/SPURF
def partis_naive_seq(lseq, fnam):
    '''
    Given a number of sequences infer the naive sequence using partis.
    '''
    # Specify filenames:
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    with open(TMPDIR+'/'+inpf+'.fa', 'w') as fho:
        for i, s in enumerate(lseq):
            fho.write('>{}\n{}\n'.format(str(i), s))
    # Run partis:
    cmd = '{}/bin/partis partition --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    # os.system(cmd)  # Print partis STDOUT to screen
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))

    try:
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR+'/'+outf+'-cluster-annotations.csv') as fh:
            reader = csv.DictReader(fh)
            data = list(reader)
        # assert(len(data) == 1)  # There should really only be one clonal family, but there are often several, so just take the first (largest)
        # Extract germline bounds info and trim the naive DNA sequence:
        try:
            utils.process_input_line(data[0])       # Process dataframe row
            fnam_base = fnam.split('_partitions')[0].split('/')
            #glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(fnam_base[0], fnam_base[-1]), locus=args.LOCUS)
            glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus=args.LOCUS)
            utils.add_implicit_info(glfo, data[0])  # Add germline info
        except Exception as e:
            print e
            raise e

        naiveDNA = data[0]['naive_seq'][:]
        first_lseq = data[0]['input_seqs'][:][0]
        vj_bounds = (data[0]['regional_bounds']['v'][0], data[0]['regional_bounds']['j'][1])
        naiveDNA = repair_new_naive(naiveDNA[:], naiveDNA[:], vj_bounds)
        first_lseq = repair_new_naive(first_lseq, naiveDNA[:], vj_bounds)
        try:
            assert(len(first_lseq) == len(naiveDNA))
        except AssertionError:
            print 'len(first_lseq) != len(data[0]["naive_seq"])'
            print len(first_lseq)
            print first_lseq
            print len(naiveDNA)
            print naiveDNA
        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print 'Found stop codon in inferred naive sequence, will replace with input sequence codon.'
            print 'Before replacement:', naiveDNA
            naiveDNA_l = list(naiveDNA[:])
            for codon in range(vj_bounds[0], vj_bounds[1], 3):
                if '*' == str(Seq(naiveDNA[codon:codon+3], generic_dna).translate()):
                    naiveDNA_l[codon:codon+3] = first_lseq[codon:codon+3]
            naiveDNA = ''.join(naiveDNA_l)
            print 'After replacement:', naiveDNA
        if naiveDNA == first_lseq:
            print 'Note: naiveDNA == first_lseq (nothing bad, just checking that the repair is not simply replacing the naive sequence with the input entirely)'

        return(naiveDNA)
    finally:
        # Clean up:
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
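
# A minimal sketch of how partis_naive_seq() above might be called -- the sequences
# and filename are hypothetical, and TMPDIR, partis_path and args come from the
# surrounding script:
lseq = ['CAGGTGCAGCTGGTGGAGTCT', 'CAGGTGCAGCTGGTGGAGTCA']  # toy clonal-family sequences
naive = partis_naive_seq(lseq, 'foo_partitions.csv')
print 'inferred naive sequence: %s' % naive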
Example #8
def extract_seqs(fnam, uid2iso):
    '''Reads a partis cluster-annotations file and extracts relevant information and sequences.'''
    # Read cluster annotations into a data list of dictionaries:
    with open(fnam) as fh:
        reader = csv.DictReader(fh)
        data = list(reader)

    sequences_i = list()
    info_i = list()
    for row in data:
        fnam_base = fnam.split('_partitions')[0]
        cwd = os.getcwd()
        if 'IgK' in fnam_base:
            locus = 'igk'
        elif 'IgL' in fnam_base:
            locus = 'igl'
        else:
            locus = 'igh'
        # Process the partis data row and add germline information:
        try:
            utils.process_input_line(row)
            # Read default germline info
            glfo = glutils.read_glfo('{}/_output/{}/hmm/germline-sets'.format(cwd, fnam_base), locus=locus)
            utils.add_implicit_info(glfo, row)
        except Exception as e:  # Skip rows that cannot be processed
            print('First skip')
            print(e)
            continue

        uids = [dl + [u] if (len(dl) > 0 and dl[0] != '') else [u] for dl, u in zip(row['duplicates'], row['unique_ids'])]

        # Extract the full N-padded naive sequence,
        # and find the V- and J-gene bounds on this naive sequence:
        cdr3_bounds = (row['codon_positions']['v'], row['codon_positions']['j'] + 3)
        vj_bounds = (row['regional_bounds']['v'][0], row['regional_bounds']['j'][1])
        if row['invalid'] is True or (cdr3_bounds[0]-cdr3_bounds[1])%3 != 0:
            print('Invalid clonal family, skipping.')
            continue

        naiveDNA = row['naive_seq']
        if repair_seq(naiveDNA, naiveDNA, vj_bounds, keep_check=True) is False:  # Skip naive sequences too short or with stop codons:
            # print('Third skip')
            if len(row['input_seqs'][:]) > 100:
                print('Bad naive even after 100 seqs in clonal family.')
                repair_seq_debug(naiveDNA, naiveDNA, vj_bounds)
            continue
        trimmed_naiveDNA = repair_seq(naiveDNA[:], naiveDNA[:], vj_bounds)
        naiveAA = str(Seq(trimmed_naiveDNA, generic_dna).translate())

        # There has been a name change and this try/except
        # is meant to provide backwards compatibility:
        try:
            lseq = row['input_seqs'][:]
        except:
            lseq = row['seqs'][:]
        ir_lseq = row['indel_reversed_seqs']
        stop_seq = row['stops']
        assert(len(lseq) == len(ir_lseq))
        assert(len(lseq) == len(stop_seq))
        # Only keep sequences without indels or stop codons, and above a minimum amino acid length:
        ### ir_lseq[i] == '' or lseq[i] == ir_lseq[i]  <-- No indels
        ### stop_seq[i]  <-- No partis-annotated stops (there still seem to be stops after these are removed though)
        ### repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)  <-- Checks whether the sequence is long enough and free of stop codons
        keep_idx = [1 if ((ir_lseq[i] == '' or lseq[i] == ir_lseq[i]) and stop_seq[i] is False and repair_seq(lseq[i], naiveDNA, vj_bounds, keep_check=True)) else 0 for i in range(len(lseq))]

        # Now only keep those sequences that passed QC:
        lseq = [s for s, keep in zip(lseq, keep_idx) if keep == 1]
        # Exclude small clonal families:
        if len(lseq) < MIN_OBS:
            # print(len(lseq))
            # print('Fourth skip')
            continue
        # Get amino acid sequences:
        lAAseq = [str(Seq(repair_seq(s[:], naiveDNA[:], vj_bounds), generic_dna).translate()) for s in lseq]
#        mut_freqs = [s for s, keep in zip(row['mut_freqs'], keep_idx) if keep == 1]
#        print(row['n_mutations'].split(':'))
        Nmuts = [int(s) for s, keep in zip(row['n_mutations'].split(':'), keep_idx) if keep == 1]
        abundance = [len(d) for d, keep in zip(uids, keep_idx) if keep == 1]
        uids = [s for s, keep in zip(uids, keep_idx) if keep == 1]
        assert(len(Nmuts) == len(lseq))
        assert(len(abundance) == len(lseq))
        assert(len(uids) == len(lseq))
#        assert(len(mut_freqs) == len(lseq))
        # Convert frequency to counts and throw out info for discarded sequences:
#        Nmuts = [int(round(float(t[0])*len(t[1].strip('N')))) for i, t in enumerate(zip(mut_freqs, lseq))]

        # Deduplicate AAseqs and lseq according to the AA deduplication:
        '''
        lAAseq_dict = dict()
        lAAseq_sort = dict()
        lseq_dedup = list()
        for i, aa in enumerate(lAAseq):
            if aa in lAAseq_sort:
                lAAseq_sort[aa].append((i, repair_seq(lseq[i][:], naiveDNA[:], vj_bounds), abundance[i]))
            else:
                lAAseq_sort[aa] = [(i, repair_seq(lseq[i][:], naiveDNA[:], vj_bounds), abundance[i])]

        for i, aa in enumerate(lAAseq_sort):
            lAAseq_dict[aa] = [t[0] for t in lAAseq_sort[aa]]
            s = sorted(lAAseq_sort[aa], )
            ab_seq = sorted(lAAseq_sort[aa], key=lambda x: x[2], reverse=True)[0][1]
            lseq_dedup.append(ab_seq)

        assert(len(lAAseq_dict) == len(lseq_dedup))
        # Make the deduplicated list and take the mutation rates,
        #  as the mutation rate for the deduplicated sequence:
        lAAseq_dedup = list()
        Nmuts_dedup = list()
        abundance_dedup = list()
        for aa, idxs in lAAseq_dict.items():
            lAAseq_dedup.append(aa)
            Nmut_list = [float(Nmuts[i]) for i in idxs]
            Nmuts_dedup.append(int(round(sum(Nmut_list)/len(Nmut_list))))
            abundance_list = [abundance[i] for i in idxs]
            abundance_dedup.append(sum(abundance_list))
        assert(len(lAAseq_dedup) == len(Nmuts_dedup))
        assert(len(lAAseq_dedup) == len(abundance_dedup))
        assert(len(lAAseq_dedup) == len(lseq_dedup))

        # Exclude small clonal families:
        if len(lAAseq_dedup) < MIN_OBS:
            # print(len(lseq))
            # print('Fourth skip')
            continue
        '''
        iso_list = [[uid2iso[u] for u in ul] for ul in uids]
        # Store the results in a list:
        sequences_i.append(['naive_seq', naiveAA])  # This format is for ANARCI numbering
        info_i.append({'fnam': fnam, 'v_gene': row['v_gene'], 'd_gene': row['d_gene'], 'j_gene': row['j_gene'],
                       'naive_seq': naiveAA, 'naive_seq_DNA': trimmed_naiveDNA, 'Nmuts': Nmuts[:], 'abundance': abundance[:],
                       'AAseqs': lAAseq[:], 'DNAseqs': lseq[:], 'UID': uids[:], 'isotype': iso_list[:],
                       'CDR3_start': cdr3_bounds[0], 'CDR3_end': cdr3_bounds[1]})

    return(sequences_i, info_i)
Example #9
def write_partis_data_from_annotations(
    output_genes,
    output_seqs,
    path_to_annotations,
    metadata,
    filters={},
    seq_filters={},
    min_clonal_family_size=0,
    min_seq_len=0,
    max_mut_pct=1.,
    min_mut_pct=0.,
    clone_str='',
    region='v',
    germline_family='v',
):
    """
    Function to read partis annotations csv

    @param path_to_annotations: path to annotations files
    @param metadata: csv file of metadata; if None defaults will be used for chain/species
    @param filters: dictionary of lists with keys as column name and items as those values of the column variable to retain;
        filters out families, e.g., {'locus': ['igk']}, etc.
    @param seq_filters: same as filters, but for sequences, e.g., {'indel_reversed_seqs': [''], 'in_frames': [False]} will
        only retain sequences that are out of frame and did not have an indel
    @param min_clonal_family_size: minimum clonal family size
    @param min_seq_len: minimum sequence length
    @param max_mut_pct: maximum mutation percentage
    @param min_mut_pct: minimum mutation percentage
    @param clone_str: string for identifying clones (useful if merging annotations from multiple datasets)
    @param region: B-cell receptor region ('v', 'd', 'j', or 'vdj')
    @param germline_family: for performing cross validation ('v', 'd', or 'j')

    Writes genes to output_genes and seqs to output_seqs
    """

    families = ['v', 'd', 'j']
    if germline_family not in families:
        raise ValueError("Invalid germline_family: %s. Must be one of %s" %
                         (germline_family, families))

    regions = ['v', 'd', 'j', 'vdj']
    if region not in regions:
        raise ValueError("Invalid region: %s. Must be one of %s" %
                         (region, regions))

    PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
    sys.path.insert(1, PARTIS_PATH + '/python')
    from utils import add_implicit_info, process_input_line
    import glutils

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    with open(output_genes, 'w') as genes_file, open(output_seqs, 'w') as seqs_file:
        gene_writer = csv.DictWriter(genes_file,
                                     ['germline_name', 'germline_sequence'])
        gene_writer.writeheader()

        seq_header = [
            'germline_name',
            'sequence_name',
            'sequence',
            'germline_family',
            'v_gene',
            'region',
        ]

        for key, _ in partition_info[0].iteritems():
            seq_header += [key]

        seq_writer = csv.DictWriter(seqs_file, seq_header)
        seq_writer.writeheader()
        for data_idx, data_info in enumerate(partition_info):
            if any([
                    data_info[key] not in values
                    for key, values in filters.iteritems()
            ]):
                continue
            glfo = glutils.read_glfo(data_info['germline_file'],
                                     locus=data_info['locus'])
            with open(data_info['annotations_file'], "r") as csvfile:
                reader = csv.DictReader(csvfile)
                for idx, line in enumerate(reader):
                    if line['v_gene'] == '':
                        # failed annotations
                        continue

                    # add goodies from partis
                    process_input_line(line)
                    add_implicit_info(glfo, line)
                    n_seqs = len(line['input_seqs'])
                    if n_seqs < min_clonal_family_size:
                        # don't take small clonal families---for data quality purposes
                        continue

                    if region == 'vdj':
                        gl_seq = line['naive_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['seqs']]
                    else:
                        gl_seq = line['v_gl_seq'].lower()
                        all_seqs = [seq.lower() for seq in line['v_qr_seqs']]

                    idx_list = []
                    # frequency filter
                    idx_list.append(
                        set([
                            i for i, val in enumerate(line['mut_freqs'])
                            if val < max_mut_pct and val >= min_mut_pct
                        ]))
                    # sequence length filter
                    idx_list.append(
                        set([
                            i for i, val in enumerate(all_seqs)
                            if len(val.translate(None, 'n')) > min_seq_len
                        ]))
                    for key, values in seq_filters.iteritems():
                        idx_list.append(
                            set([
                                i for i, val in enumerate(line[key])
                                if val in values
                            ]))

                    good_seq_idx = set.intersection(*idx_list)
                    if not good_seq_idx:
                        # no sequences after filtering... skip
                        continue

                    gl_name = 'clone{}-{}-{}'.format(data_idx, idx, clone_str)
                    gene_writer.writerow({
                        'germline_name': gl_name,
                        'germline_sequence': gl_seq,
                    })

                    for good_idx in good_seq_idx:
                        base_dict = {
                            'germline_name': gl_name,
                            'sequence_name': '-'.join([gl_name, line['unique_ids'][good_idx]]),
                            'sequence': all_seqs[good_idx].lower(),
                            'germline_family': line['{}_gene'.format(germline_family)][:5],
                            'v_gene': line['v_gene'],
                            'region': region,
                        }

                        for key, value in data_info.iteritems():
                            base_dict[key] = value

                        seq_writer.writerow(base_dict)
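
# A hedged sketch of a call to write_partis_data_from_annotations() above -- all
# paths are hypothetical, and get_partition_info() is assumed to be defined
# elsewhere in the project:
write_partis_data_from_annotations(
    'genes.csv',                    # germline output csv
    'seqs.csv',                     # per-sequence output csv
    '/path/to/annotations',
    'metadata.csv',
    filters={'locus': ['igh']},     # keep only heavy-chain families
    min_clonal_family_size=5,       # drop tiny clonal families for data quality
    region='v',
)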
Example #10
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None, name_column=None, seq_column=None, seed_unique_id=None, abbreviate_names=False):
    """ return list of sequence info from files of several types """

    # WARNING defaults for <name_column> and <seq_column> are also set in partis (since we call this from places other than partis, but we also want people to be able to set them from the partis command line)
    internal_name_column = 'unique_id'  # key we use in the internal dictionaries
    internal_seq_column = 'seq'
    if name_column is None:  # header we expect in the file
        name_column = internal_name_column
    if seq_column is None:
        seq_column = internal_seq_column

    if not is_data and glfo is None:
        print '  WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(fname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % fname)
        reader = []
        n_fasta_queries = 0
        for seq_record in SeqIO.parse(fname, ftype):

            # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
            if queries is not None and seq_record.name not in queries:
                continue

            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if abbreviate_names:
        potential_names = list(string.ascii_lowercase)
    for line in reader:
        if name_column not in line or seq_column not in line:
            raise Exception('mandatory headers \'%s\' and \'%s\' not both present in %s    (you can set column names with --name-column and --seq-column)' % (name_column, seq_column, fname))
        if name_column != internal_name_column or seq_column != internal_seq_column:
            translate_columns(line, {name_column : internal_name_column, seq_column: internal_seq_column})
        utils.process_input_line(line)
        unique_id = line[internal_name_column]

        ## Actually deal with colons properly since they come up VERY OFTEN in sequence IDs
        unique_id = unique_id.replace(":", "_")
        if any(fc in unique_id for fc in utils.forbidden_characters):
            raise Exception('found a forbidden character (one of %s) in sequence id \'%s\' -- sorry, you\'ll have to replace it with something else' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), unique_id))

        if abbreviate_names:
            unique_id = abbreviate(used_names, potential_names, unique_id)

        # if command line specified query or reco ids, skip other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        if unique_id in input_info:
            raise Exception('found id %s twice in file %s' % (unique_id, fname))

        if seed_unique_id is not None and unique_id == seed_unique_id:
            found_seed = True

        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[internal_seq_column]}

        if n_queries == 0 and is_data and 'v_gene' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % fname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % fname)
            reco_info[unique_id] = copy.deepcopy(line)
            reco_info[unique_id]['unique_id'] = unique_id  # in case we're abbreviating
            if glfo is not None:
                utils.add_implicit_info(glfo, reco_info[unique_id], multi_seq=False, existing_implicit_keys=('cdr3_length', ))  # single seqs, since each seq is on its own line in the file

        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    if seed_unique_id is not None and not found_seed:
        raise Exception('couldn\'t find seed %s in %s' % (seed_unique_id, fname))

    return (input_info, reco_info)
Example #11
def get_seqfile_info(infname, is_data, n_max_queries=-1, args=None, glfo=None, simglfo=None):
    """ return list of sequence info from files of several types """

    if not is_data and glfo is None:
        print '  WARNING glfo is None, so not adding implicit info'

    suffix = os.path.splitext(infname)[1]
    if len(re.findall('\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = opener('r')(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % infname)
        reader = []
        n_fasta_queries = 0
        already_printed_forbidden_character_warning = False
        for seq_record in SeqIO.parse(infname, ftype):

            # if command line specified query or reco ids, skip other ones (can't have/don't allow simulation info in a fast[aq])
            if args is not None and args.queries is not None and seq_record.name not in args.queries:
                continue

            reader.append({})

            uid = seq_record.name
            if any(fc in uid for fc in utils.forbidden_characters):
                if not already_printed_forbidden_character_warning:
                    print '  %s: found a forbidden character (one of %s) in sequence id \'%s\'. This means we\'ll be replacing each of these forbidden characters with a single letter from their name (in this case %s). If this will cause problems you should replace the characters with something else beforehand.' % (utils.color('yellow', 'warning'), ' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid, uid.translate(utils.forbidden_character_translations))
                    already_printed_forbidden_character_warning = True
                uid = uid.translate(utils.forbidden_character_translations)

            reader[-1]['unique_ids'] = uid
            reader[-1]['input_seqs'] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seqs' not in line and 'seq' not in line:
            raise Exception('couldn\'t find a sequence column in %s (you can set this with --seq-column)' % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        inseq = line['input_seqs'][0]

        # NOTE I just moved this to the .fa loop, since otherwise we have no way of knowing how to interpret special characters... nevertheless if someone passes in a csv with special characters as part of a uid this will break
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     if not already_printed_forbidden_character_warning:
        #         print '  %s: found a forbidden character (one of %s) in sequence id \'%s\'. This means we\'ll be replacing each of these forbidden characters with a single letter from their name (in this case %s). If this will cause problems you should replace the characters with something else beforehand.' % (utils.color('yellow', 'warning'), ' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid, uid.translate(utils.forbidden_character_translations))
        #         already_printed_forbidden_character_warning = True
        #     uid = uid.translate(utils.forbidden_character_translations)
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' % (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            raise Exception('unexpected character (not among %s) in input sequence with id %s:\n  %s' % (utils.nukes + utils.ambiguous_bases, uid, inseq))

        input_info[uid] = {'unique_ids' : [uid, ], 'seqs' : [inseq, ]}

        if n_queries_added == 0 and is_data and 'v_gene' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            break

    if args is not None:
        if args.istartstop is not None:
            n_lines_in_file = iline + 1
            if n_lines_in_file < args.istartstop[1]:
                raise Exception('--istartstop upper bound %d larger than number of lines in file %d' % (args.istartstop[1], n_lines_in_file))
        if len(input_info) == 0:
            if args.queries is not None:
                raise Exception('didn\'t find the specified --queries (%s) in %s' % (str(args.queries), infname))
            if args.reco_ids is not None:
                raise Exception('didn\'t find the specified --reco-ids (%s) in %s' % (str(args.reco_ids), infname))
        if args.queries is not None:
            missing_queries = set(args.queries) - set(input_info)
            extra_queries = set(input_info) - set(args.queries)  # this is just checking for a bug in the code just above here...
            if len(missing_queries) > 0:
                raise Exception('didn\'t find some of the specified --queries: %s' % ' '.join(missing_queries))
            if len(extra_queries) > 0:
                raise Exception('extracted uids %s that weren\'t specified with --queries' % ' '.join(extra_queries))
        if args.seed_unique_id is not None:
            if found_seed:
                if args.seed_seq is not None:  # and input_info[args.seed_unique_id]['seqs'][0] != args.seed_seq:
                    # raise Exception('incompatible --seed-unique-id and --seed-seq (i.e. the sequence in %s corresponding to %s wasn\'t %s)' % (infname, args.seed_unique_id, args.seed_seq))
                    raise Exception('--seed-seq was specified, but --seed-unique-id was also present in input file')
            else:
                if args.seed_seq is None:
                    raise Exception('couldn\'t find seed unique id %s in %s' % (args.seed_unique_id, infname))
                add_seed_seq(args, input_info, reco_info, is_data)
        elif args.seed_seq is not None:
            args.seed_unique_id = 'seed-seq'
            add_seed_seq(args, input_info, reco_info, is_data)
        elif args.random_seed_seq:  # already checked (in bin/partis) that other seed args aren't set
            args.seed_unique_id = random.choice(input_info.keys())
            print '    chose random seed unique id %s' % args.seed_unique_id

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
Example #12
def _get_clonal_family_stats(path_to_annotations,
                             metadata,
                             use_np=False,
                             use_immunized=False,
                             locus=''):
    '''
    get data statistics from partis annotations

    @param path_to_annotations: path to partis annotations
    @param metadata: path to partis metadata 
    @param use_np: use nonproductive seqs?
    @param use_immunized: for Cui data, use immunized mice?
    @param locus: which locus to use

    @return list of dicts with clonal family sizes and naive seqs from processed data
    '''

    partition_info = get_partition_info(
        path_to_annotations,
        metadata,
    )

    if use_np:
        # return only nonproductive sequences
        # here "nonproductive" is defined as having a stop codon or being
        # out of frame or having a mutated conserved cysteine
        good_seq = lambda seqs: [stop or not in_frame or mut_inv  # element-wise: one flag per sequence
                                 for stop, in_frame, mut_inv in zip(seqs['stops'], seqs['in_frames'], seqs['mutated_invariants'])]
    else:
        # return all sequences
        good_seq = lambda seqs: [True for seq in seqs['seqs']]

    all_germline_dicts = []
    for data_idx, data_info in enumerate(partition_info):
        if use_immunized and data_info['group'] != 'immunized':
            continue
        if locus and data_info['locus'] != locus:  # only filter by locus if one was specified
            continue
        PARTIS_PATH = os.path.dirname(os.path.realpath(__file__)) + '/partis'
        sys.path.insert(1, PARTIS_PATH + '/python')
        from utils import add_implicit_info, process_input_line
        import glutils
        glfo = glutils.read_glfo(data_info['germline_file'],
                                 locus=data_info['locus'])
        with open(data_info['annotations_file'], "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for idx, line in enumerate(reader):
                # add goodies from partis
                if len(line['input_seqs']) == 0:
                    # sometimes data will have empty clusters
                    continue
                process_input_line(line)
                add_implicit_info(glfo, line)
                good_seq_idx = [
                    i for i, is_good in enumerate(good_seq(line)) if is_good
                ]
                if not good_seq_idx:
                    # no nonproductive sequences... skip
                    continue
                else:
                    all_germline_dicts.append({
                        'n_taxa': len(good_seq_idx),
                        'germline_sequence': disambiguate(line['v_gl_seq'].lower()),
                        'germline_name': '-'.join([line['v_gene'], str(idx)]),
                        'v_call': line['v_gene'],
                    })

    return all_germline_dicts
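
# A hedged usage sketch for _get_clonal_family_stats() above (the paths are
# hypothetical, and get_partition_info() comes from the surrounding project):
germline_dicts = _get_clonal_family_stats('/path/to/annotations', 'metadata.csv',
                                          use_np=True, locus='igk')
print '%d clonal families with nonproductive sequences' % len(germline_dicts)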
Example #13
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ return list of sequence info from files of several types """

    suffix = os.path.splitext(fname)[1]
    if suffix == '.csv':
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif suffix == '.tsv':
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        if suffix == '.fasta' or suffix == '.fa':
            ftype = 'fasta'
        elif suffix == '.fastq' or suffix == '.fq':
            ftype = 'fastq'
        else:
            raise Exception('couldn\'t handle file extension for %s' % fname)
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        for seq_record in SeqIO.parse(fname, ftype):

            # if command line specified query or reco ids, skip other ones
            if queries is not None and seq_record.name not in queries:
                continue
            # if reco_ids is not None and line['reco_id'] not in reco_ids:  # probably no reco ids in a fasta file
            #     continue

            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    for line in reader:
        if '.csv' in fname and name_column not in line:  # hackey hackey hackey
            name_column = 'name'
            seq_column = 'nucleotide'
        utils.process_input_line(line)
        unique_id = line[name_column]
        if ':' in unique_id:
            raise Exception('found a \':\' in sequence id \'%s\' -- you\'ll have to replace it with something else, as we use \':\'s internally to concatenate sequence ids' % unique_id)

        # if command line specified query or reco ids, skip other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[seq_column]}
        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
            reco_info[unique_id] = dict(line)
            if 'indels' in line and line['indels']['reversed_seq'] != '':  # TODO unhackify this
                reco_info[unique_id]['seq'] = line['indels']['reversed_seq']
            if 'indels' not in line:  # TODO unhackify this
                reco_info[unique_id]['indels'] = None
            if glfo is not None:
                utils.remove_implicit_info(reco_info[unique_id], multi_seq=False)
                utils.add_implicit_info(glfo, reco_info[unique_id], multi_seq=False)  # each seq is on its own line in the file
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    
    return (input_info, reco_info)
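
# A minimal usage sketch for the get_seqfile_info() variant above; the fasta
# filename is hypothetical, and is_data=True because a plain fasta carries no
# simulation truth, so reco_info comes back as None.
input_info, reco_info = get_seqfile_info('my-seqs.fa', is_data=True, n_max_queries=10)
for uid, info in input_info.items():
    print '%-20s %s' % (uid, info['seq'])
assert reco_info is None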
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print 'first parse an annotation csv file:'
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        cdr3_bounds = (line['codon_positions']['v'], line['codon_positions']['j'] + 3)
        print ''
        print '  should match the above:'
        print '    %s naive cdr3' % line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
        print '    %s mature' % line['indel_reversed_seqs'][0][cdr3_bounds[0] : cdr3_bounds[1]]
        print ''
        break

print 'then parse a partition csv file:'
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)
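
# As a small extension of the example above, one might translate the extracted
# naive cdr3 to amino acids. This sketch assumes an older Biopython in which
# generic_dna still exists (matching its use in Example #17 below); <line> and
# <cdr3_bounds> are still in scope from the annotation loop.
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

cdr3_nuc = line['naive_seq'][cdr3_bounds[0] : cdr3_bounds[1]]
print '    %s naive cdr3 (translated)' % Seq(cdr3_nuc, generic_dna).translate()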
Example #15
def get_seqfile_info(fname, is_data, glfo=None, n_max_queries=-1, queries=None, reco_ids=None):
    """ return list of sequence info from files of several types """

    if '.csv' in fname:
        delimiter = ','
        name_column = 'unique_id'
        seq_column = 'seq'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.tsv' in fname:
        delimiter = '\t'
        name_column = 'name'
        seq_column = 'nucleotide'
        seqfile = opener('r')(fname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    elif '.fasta' in fname or '.fa' in fname or '.fastq' in fname or '.fq' in fname:
        name_column = 'unique_id'
        seq_column = 'seq'
        reader = []
        n_fasta_queries = 0
        ftype = 'fasta' if ('.fasta' in fname or '.fa' in fname) else 'fastq'
        for seq_record in SeqIO.parse(fname, ftype):
            reader.append({})
            reader[-1][name_column] = seq_record.name
            reader[-1][seq_column] = str(seq_record.seq).upper()
            n_fasta_queries += 1
            if n_max_queries > 0 and n_fasta_queries >= n_max_queries:
                break
    else:
        raise Exception('unrecognized file format %s' % fname)

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    n_queries = 0
    for line in reader:
        if '.csv' in fname and name_column not in line:  # hackey hackey hackey
            name_column = 'name'
            seq_column = 'nucleotide'
        utils.process_input_line(line, int_columns=('v_5p_del', 'd_5p_del', 'cdr3_length', 'j_5p_del', 'j_3p_del', 'd_3p_del', 'v_3p_del'), literal_columns=('indels', ))  # NOTE the trailing comma: ('indels') without it is just a parenthesized string, not a tuple
        unique_id = line[name_column]
        # if command line specified query or reco ids, skip other ones
        if queries is not None and unique_id not in queries:
            continue
        if reco_ids is not None and line['reco_id'] not in reco_ids:
            continue

        input_info[unique_id] = {'unique_id' : unique_id, 'seq' : line[seq_column]}
        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s -- if this is data add option --is-data' % fname)
            reco_info[unique_id] = dict(line)
            if 'indels' in line and line['indels']['reversed_seq'] != '':  # TODO unhackify this
                reco_info[unique_id]['seq'] = line['indels']['reversed_seq']
            if 'indels' not in line:  # TODO unhackify this
                reco_info[unique_id]['indels'] = None
            if glfo is not None:
                utils.add_match_info(glfo, reco_info[unique_id])
        n_queries += 1
        if n_max_queries > 0 and n_queries >= n_max_queries:
            break

    if len(input_info) == 0:
        raise Exception('didn\'t end up pulling any input info out of %s while looking for queries: %s reco_ids: %s\n' % (fname, str(queries), str(reco_ids)))
    
    return (input_info, reco_info)
Example #16
def get_seqfile_info(infname,
                     is_data,
                     n_max_queries=-1,
                     args=None,
                     simglfo=None,
                     quiet=False):
    """ return list of sequence info from files of several types """

    suffix = utils.getsuffix(infname)
    if len(re.findall(r'\.[ct]sv', suffix)) > 0:
        if suffix == '.csv':
            delimiter = ','
        elif suffix == '.tsv':
            delimiter = '\t'
        else:
            assert False
        seqfile = open(infname)
        reader = csv.DictReader(seqfile, delimiter=delimiter)
    else:
        reader = utils.read_fastx(
            infname,
            name_key='unique_ids',
            seq_key='input_seqs',
            add_info=False,
            sanitize=True,
            n_max_queries=
            n_max_queries,  # NOTE don't use istarstop kw arg here, 'cause it f***s with the istartstop treatment in the loop below
            queries=(args.queries if
                     (args is not None and not args.abbreviate) else None)
        )  # NOTE also can't filter on args.queries here if we're also translating

    input_info = OrderedDict()
    reco_info = None
    if not is_data:
        reco_info = OrderedDict()
    # already_printed_forbidden_character_warning = False
    n_queries_added = 0
    found_seed = False
    used_names = set()  # for abbreviating
    if args is not None and args.abbreviate:
        potential_names = list(string.ascii_lowercase)
    iname = None  # line number -- used as sequence id if there isn't a name column in the file
    iline = -1
    for line in reader:
        iline += 1
        if args is not None:
            if args.istartstop is not None:
                if iline < args.istartstop[0]:
                    continue
                if iline >= args.istartstop[1]:
                    break
            if args.name_column is not None:
                line['unique_ids'] = line[args.name_column]
                del line[args.name_column]
            if args.seq_column is not None:
                line['input_seqs'] = line[args.seq_column]
                if args.seq_column != 'seqs':  # stupid god damn weird backwards compatibility edge case bullshit
                    del line[args.seq_column]
        if iname is None and 'unique_ids' not in line and 'unique_id' not in line:
            print '  %s: couldn\'t find a name (unique id) column, so using line number as the sequence label (you can set the name column with --name-column)' % (
                utils.color('yellow', 'warning'))
            iname = 0
        if iname is not None:
            line['unique_ids'] = '%09d' % iname
            iname += 1
        if 'input_seqs' not in line and 'seq' not in line:
            raise Exception(
                'couldn\'t find a sequence column in %s (you can set this with --seq-column)'
                % infname)
        utils.process_input_line(line)
        if len(line['unique_ids']) > 1:
            raise Exception('can\'t yet handle multi-seq csv input files')
        uid = line['unique_ids'][0]
        if uid in input_info:
            new_uid = uid
            iid = 2
            while new_uid in input_info:
                new_uid = uid + '-' + str(iid)
                iid += 1
            print '  %s uid %s already read from input file %s, so replacing with new uid %s' % (
                utils.color('yellow', 'warning'), uid, infname, new_uid)
            uid = new_uid
        inseq = line['input_seqs'][0]

        # # it would be nice to check here for forbidden characters (in addition to in the .fa code above), but it's hard because we won't have read the csv properly above if it has them
        # if any(fc in uid for fc in utils.forbidden_characters):
        #     raise Exception('found a forbidden character (one of %s) in sequence id \'%s\'' % (' '.join(["'" + fc + "'" for fc in utils.forbidden_characters]), uid))
        if args is not None:
            if args.abbreviate:  # note that this changes <uid>, but doesn't modify <line>
                uid = abbreviate(used_names, potential_names, uid)
            if args.queries is not None and uid not in args.queries:
                continue
            if args.reco_ids is not None and line['reco_id'] not in args.reco_ids:
                continue
            if args.seed_unique_id is not None and uid == args.seed_unique_id:
                found_seed = True

        if uid in input_info:
            raise Exception('found uid \'%s\' twice in input file %s' %
                            (uid, infname))

        if len(inseq.translate(None, ''.join(utils.alphabet))) > 0:
            unexpected_chars = set(
                [ch for ch in inseq if ch not in utils.alphabet])
            raise Exception(
                'unexpected character%s %s (not among %s) in input sequence with id %s:\n  %s'
                % (utils.plural(len(unexpected_chars)), ', '.join([
                    ('\'%s\'' % ch) for ch in unexpected_chars
                ]), utils.nukes + utils.ambiguous_bases, uid, inseq))

        # da business
        input_info[uid] = {
            'unique_ids': [
                uid,
            ],
            'seqs': [
                inseq,
            ]
        }

        if n_queries_added == 0 and is_data and 'reco_id' in line:
            print '  note: found simulation info in %s -- are you sure you didn\'t mean to set --is-simu?' % infname

        if not is_data:
            if 'v_gene' not in line:
                raise Exception('simulation info not found in %s' % infname)
            reco_info[uid] = copy.deepcopy(line)
            if simglfo is not None:
                utils.add_implicit_info(simglfo, reco_info[uid])

        n_queries_added += 1
        if n_max_queries > 0 and n_queries_added >= n_max_queries:
            if not quiet:  # just adding <quiet>, and too lazy to decide what other print statements it should affect, this is the only one I care about right now
                print '  --n-max-queries: stopped after reading %d queries from input file' % len(
                    input_info)
            break

    post_process(input_info, reco_info, args, infname, found_seed, is_data,
                 iline)

    if len(input_info) == 0:
        raise Exception('didn\'t read any sequences from %s' % infname)

    return input_info, reco_info
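
# A minimal sketch of calling this version directly, assuming post_process()
# (defined elsewhere in this module) tolerates args=None the same way the
# guards inside the loop above do; with args=None all of the optional
# filtering (--queries, --istartstop, custom name/seq columns) is skipped.
# The fasta filename is hypothetical.
input_info, reco_info = get_seqfile_info('my-seqs.fa', is_data=True)
print 'read %d sequences' % len(input_info)
for uid, info in input_info.items():
    print '  %s  %s' % (uid, info['seqs'][0])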
Example #17
def run_partis(seq):
    '''
    Infer VDJ genes and the naive sequence using partis.
    '''
    # Specify filenames:
    pretty_random_fnam = str(random.randint(1, 10**100))
    inpf = pretty_random_fnam + '_input'
    outf = pretty_random_fnam + '_output'
    # Write input fasta file for partis:
    with open(TMPDIR + '/' + inpf + '.fa', 'w') as fho:
        fho.write('>{}\n{}\n'.format('input_sequence', seq))
    # Run partis:
    cmd = '{}/bin/partis annotate --locus {} --species {} --infname {}/{}.fa --outfname {}/{}.csv'.format(
        partis_path, args.LOCUS, args.SPECIES, TMPDIR, inpf, TMPDIR, outf)
    os.system('{} > {}/{}.log'.format(cmd, TMPDIR, pretty_random_fnam))

    try:
        # Read the partis output file and extract the naive sequence:
        with open(TMPDIR + '/' + outf + '.csv') as fh:
            reader = csv.DictReader(fh)
            data = list(reader)
        ann = data[0]
        # Extract germline bounds info and trim the naive DNA sequence:
        try:
            utils.process_input_line(ann)  # convert the csv strings into ints/floats/lists etc.
            utils.add_implicit_info(glfo, ann)  # add implicit germline info
        except Exception as e:
            print e
            raise  # re-raise with the original traceback

        if ann['stops'][0]:  # NOTE 'stops' is a per-sequence list, so comparing it with `is True` would never fire
            raise Exception(
                'Input sequence contains a stop codon. This is not valid.')
        elif ann['v_5p_del'] > 30 or ann['j_3p_del'] > 12:
            raise Exception(
                'Incomplete input sequence error. 5-prime end missing {} nt and 3-prime missing {} nt. Max allowed is 30 and 12, respectively.'
                .format(ann['v_5p_del'], ann['j_3p_del']))
        elif ann['indelfos'][0]['indels']:
            raise Exception(
                'Input sequence contains indels, this is currently not supported.'
            )

        # Extract full size VDJ sequence for both the inferred naive and the input:
        full_gl_v = glfo['seqs']['v'][ann['v_gene']]  # Germline V
        full_gl_j = glfo['seqs']['j'][ann['j_gene']]  # Germline J

        gl_v_5p_del = full_gl_v[:ann['v_5p_del']]  # 5-prime end not included in the input
        gl_j_3p_del = full_gl_j[len(full_gl_j) - ann['j_3p_del']:]  # 3-prime end not included in the input
        # assert full_gl_v[ann['v_5p_del']:] == ann['v_gl_seq']
        naiveDNA = gl_v_5p_del + ann['naive_seq'] + gl_j_3p_del  # add back the missing positions
        full_input_seq = 'N' * ann['v_5p_del'] + ann['input_seqs'][0] + 'N' * ann['j_3p_del']  # N-pad the input sequence
        assert len(naiveDNA) == len(full_input_seq)

        # Remove the untranslated end:
        if len(naiveDNA) % 3 != 0:
            naiveDNA = naiveDNA[0:-(len(naiveDNA) % 3)]
        if len(full_input_seq) % 3 != 0:
            full_input_seq = full_input_seq[0:-(len(full_input_seq) % 3)]
        if len(naiveDNA) != len(full_input_seq):
            raise Exception(
                'Sequences not equally long after trimming.\nInput: {}\nNaive: {}\n.'
                .format(full_input_seq, naiveDNA))

        # Replace Ns in input sequence with naive DNA bases:
        full_input_seq = repair_seq(full_input_seq, naiveDNA[:])

        # If the inferred naive sequence contains a stop codon replace it by the input sequence codon:
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            print 'Found stop codon in inferred naive sequence, will replace with input sequence codon.'
            print 'Before replacement:', naiveDNA
            naiveDNA_l = list(naiveDNA)
            for codon in range(0, len(naiveDNA), 3):
                if str(Seq(naiveDNA[codon:codon + 3], generic_dna).translate()) == '*':
                    naiveDNA_l[codon:codon + 3] = full_input_seq[codon:codon + 3]
            naiveDNA = ''.join(naiveDNA_l)
            print 'After replacement:', naiveDNA
        if '*' in str(Seq(naiveDNA, generic_dna).translate()):
            raise Exception('Naive sequence could not be repaired.')
        if naiveDNA == full_input_seq:
            print 'Warning: input sequence is identical to the inferred naive sequence.'
    finally:
        # Clean up:
        os.system('rm -r {}/{}* _output/*{}*'.format(TMPDIR, pretty_random_fnam, pretty_random_fnam))
    return naiveDNA, full_input_seq, (ann['v_gene'], ann['d_gene'], ann['j_gene'])
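
# A minimal sketch of driving run_partis(), assuming the surrounding module
# defines partis_path, TMPDIR, glfo, repair_seq() and an <args> namespace with
# LOCUS and SPECIES (as the function body requires); <my_seq> is a placeholder
# for a real heavy chain nucleotide sequence, not a value from this document.
naiveDNA, padded_input_seq, (v_gene, d_gene, j_gene) = run_partis(my_seq)
print 'V/D/J calls: %s / %s / %s' % (v_gene, d_gene, j_gene)
print 'inferred naive sequence: %s' % naiveDNA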