Пример #1
0
def main(args):
    """Merge the input files and relabel every sequence with a running number.

    All files in ``args.files`` are loaded and concatenated in order.  Each
    sequence then gets a zero-padded, four-digit label ('0001', '0002', ...).
    The relabelled collection is saved to ``args.outfile`` when given, and
    the "new label <TAB> original label" table is written to
    ``args.tabfile`` when given.
    """

    merged = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    # (new label, original label) pairs, in input order
    label_map = []
    for number, seq in enumerate(merged, 1):
        renamed = '%04d' % number
        label_map.append((renamed, seq.label))
        seq.label = renamed

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])

    if args.tabfile:
        with open(args.tabfile, 'w') as table:
            table.writelines('%s\t%s\n' % pair for pair in label_map)
Пример #2
0
    def view(self, obj):
        """Show *obj* in a viewer frame.

        An object with an ``edit_bases`` attribute is wrapped in a trace
        model and shown in a TraceFrame; otherwise an object with a ``type``
        attribute is wrapped in an MSA model and shown in a SequenceFrame.
        Objects with neither attribute are ignored.  The frame is placed in
        the main window if that window has no main frame yet, otherwise in a
        newly created IMainWindow.
        """

        model = None
        frame_factory = None

        print(obj)
        if hasattr(obj, 'edit_bases'):
            print(obj)

            from insane.core.trace.frame import TraceFrame
            from insane.core.trace.model import tracemodel
            model = tracemodel(obj)
            frame_factory = TraceFrame

        elif hasattr(obj, 'type'):

            from insane.core.msa.frame import SequenceFrame
            from insane.core.msa.model import MSA
            model = MSA(obj)
            frame_factory = SequenceFrame
            cout('MSA prepared')

        # nothing recognisable to display
        if not frame_factory or model is None:
            return

        frame = frame_factory(self.get_mainwin().default_env, model)
        cout('Frame created')

        target = self.get_mainwin()
        opened_new_window = target.mainframe() is not None
        if opened_new_window:
            # the main window is already occupied, so use a fresh one
            from insane.core.main.mainwin import IMainWindow
            target = IMainWindow()

        target.setWindowTitle(model.filename() + ' - seqpy/InSAnE')
        target.show_centralwidget(frame)
        if opened_new_window:
            target.show()
Пример #3
0
def start_app(arg):
    """Create a main window for *arg*, show it and run the Qt event loop.

    The module-level ``app`` is reused when set; otherwise the existing
    QApplication instance is looked up and, failing that, a new one is
    created (from *arg* when it is a list, else from ``['__builtin__']``).

    *arg* is dispatched as follows:
      - a list with at least two items: its second item is loaded,
      - a string: it is loaded,
      - anything else (including a shorter list): it is passed to
        ``IMainWindow.view``.
    """
    global app
    cout("Starting GUI\n")

    # isinstance() instead of type() == ..., so subclasses are accepted too
    arg_is_list = isinstance(arg, list)

    if not app:
        app = QtGui.QApplication.instance()
        if not app:
            app = QtGui.QApplication(arg if arg_is_list else ['__builtin__'])

    w = IMainWindow()

    if arg_is_list and len(arg) >= 2:
        w.load(arg[1])
    elif isinstance(arg, str):
        w.load(arg)
    else:
        cout('viewing...')
        w.view(arg)
    w.show()
    app.exec_()

    # best-effort cleanup after the event loop has finished; a RuntimeError
    # from hide() is deliberately ignored
    # NOTE(review): presumably raised when the Qt object is already gone - confirm
    try:
        w.hide()
        del w
    except RuntimeError:
        pass
Пример #4
0
def main(args):
    """Translate the sequences of all input files and save the result.

    Without ``args.start_sequence`` each sequence is translated starting at
    ``args.start_codon``.  With it, the upper-cased start sequence is searched
    in the degapped, upper-cased sequence; sequences that do not yield
    exactly one hit are skipped, the others are translated from the hit
    position + 1.  All translations are saved to ``args.outfile``.
    """

    translations = bioio.multisequence()

    if args.start_sequence:
        args.start_sequence = args.start_sequence.upper().encode('ASCII')

    for path in args.files:

        records = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(records), path))

        for record in records:
            protein = record.clone()

            if not args.start_sequence:
                protein.set_sequence(
                    funcs.translated(record, start_pos=args.start_codon))
                translations.append(protein)
                continue

            # locate the start sequence with the restriction-site search
            target = funcs.uppercased(funcs.degapped(record))
            hits = funcs.search_restriction_site(target, args.start_sequence)
            if len(hits) != 1:
                continue

            offset = hits[0][0]
            print(target[offset:offset + 30])
            protein.set_sequence(
                funcs.translated(target, start_pos=offset + 1))
            translations.append(protein)

    bioio.save(translations, args.outfile)
Пример #5
0
    def __init__(self, view, blink=True):
        """Create a caret anchored to *view*.

        *view* must not have a caret yet (``view._caret`` is asserted to be
        None).  *blink* is stored as the blinking flag.  Every position and
        size field starts at -1.
        """
        super(SequenceCaret, self).__init__()
        cout('SequenceCaret.__init__() executed!')
        assert view._caret is None

        # the view this caret is anchored to
        self._view = view

        # current and next cursor location (idx, pos)
        self.cur_idx = self.cur_pos = -1
        self.next_idx = self.next_pos = -1

        # cursor x, y coordinate: x is translated from pos, y from idx
        self.cur_x = self.cur_y = -1

        # cursor width and height
        self.w = self.h = -1

        # blinking state
        self._blink = blink
        self._counter = 0
        self._timerid = None
        self._visible = False
        self._revimg = self._norimg = None
Пример #6
0
def align(seqs, method=None, matrix='DNA'):
    """Align the sequences in *seqs*.

    With exactly two sequences, both are degapped and aligned pairwise
    (upper-cased) using *method* (default ``'global_cfe'``) and *matrix*.
    The result is the tuple ``(aligned_0, aligned_1, score)``, where the
    aligned sequences have the case of the degapped inputs restored.

    With more than two sequences nothing is done yet and None is returned;
    multiple sequence alignment is not implemented in this function.

    Raises:
        RuntimeError: if fewer than two sequences are given.
    """

    if len(seqs) < 2:
        # the original raised the misspelled ``RuntimerError``, which
        # surfaced as a NameError instead of the intended exception
        raise RuntimeError('Alignment must involve 2 or more sequences')

    if len(seqs) > 2:
        # placeholder for multiple sequence alignment (e.g. method 'muscle')
        return None

    # pairwise alignment; imported here so the aligner is only needed
    # when a pairwise alignment is actually requested
    from seqpy.core.pwaligner import calign
    s_0 = degapped(seqs[0])
    s_1 = degapped(seqs[1])

    a_0, a_1, score = calign.aligner(s_0.upper(),
                                     s_1.upper(),
                                     method=method or 'global_cfe',
                                     matrix=matrix)
    cout('pairwise aligned with score: %f' % score)
    return (preserve_case(s_0, a_0), preserve_case(s_1, a_1), score)
Пример #7
0
def main(args):
    """Condense the sequences of ``args.infile`` and save them.

    The condensed collection is written to ``args.outfile``; when
    ``args.report`` is set, a report on it is written there as well.
    """

    sequences = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(sequences), args.infile))

    condensed = funcs.condensed(sequences)
    bioio.save(condensed, args.outfile)

    if not args.report:
        return
    write_report(condensed, args.report)
Пример #8
0
def main(args):
    """Merge sequence files, then annotate, filter, sort, summarise and save.

    Steps, in order: load and concatenate ``args.files``; append the source
    attributes; optionally set labels to accession numbers (``args.accno``);
    optionally degap (``args.degap``); drop sequences shorter than
    ``args.minlen``, longer than ``args.maxlen`` or with a fraction of ``N``
    above ``args.maxN`` (each limit is only active when > 0); optionally
    sort by length (descending) or by label; optionally print base counts;
    optionally save to ``args.outfile``.
    """

    merged = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    append_attributes(merged, args.src, args.src_isolate, args.definition)

    if args.accno:
        set_label_to_accno(merged)

    if args.degap:
        merged.degap()

    if args.minlen > 0 or args.maxlen > 0 or args.maxN > 0:
        kept = bioio.multisequence()
        for seq in merged:
            # checks are evaluated left to right and stop at the first hit
            rejected = (
                (args.minlen > 0 and len(seq) < args.minlen)
                or (args.maxlen > 0 and len(seq) > args.maxlen)
                or (args.maxN > 0
                    and seq.seq.count(b'N') / len(seq) > args.maxN)
            )
            if not rejected:
                kept.append(seq)
        merged = kept

    if args.sort:
        if args.sort.startswith('len'):
            merged.sort(len, reverse=True)
        elif args.sort.startswith('lab'):
            merged.sort(lambda item: item.label)

    if args.summary:
        for seq in merged:
            bases = seq.seq.upper()
            label = seq.label.decode('ASCII')
            counts = tuple(
                bases.count(symbol)
                for symbol in (b'A', b'C', b'G', b'T', b'-'))
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % ((label,) + counts))

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])
Пример #9
0
def main(args):
    """Translate every sequence of the input files and save the result.

    Each sequence is cloned, the clone's sequence is replaced by the
    translation starting at ``args.start_codon``, and all clones are saved
    to ``args.outfile``.
    """

    translations = bioio.multisequence()

    for path in args.files:
        records = bioio.load(path, options=args.io_opts)
        cout('reading %d sequences from %s' % (len(records), path))

        for record in records:
            protein = record.clone()
            protein.set_sequence(
                funcs.translated(record, start_pos=args.start_codon))
            translations.append(protein)

    bioio.save(translations, args.outfile)
Пример #10
0
def statseq(args):
    """Print per-sequence counts of A, C, G, T, N and '-' for ``args.infile``.

    Counting is done on the upper-cased sequence; ``L`` in the output is the
    sum of the six counts.
    """

    sequences = bioio.load(args.infile, options=args.io_opts or [])

    symbols = (b'A', b'C', b'G', b'T', b'N', b'-')
    for record in sequences:
        upper = record.seq.upper()
        counts = tuple(upper.count(symbol) for symbol in symbols)
        fields = counts + (sum(counts), record.label)

        cout('A: %3d  C: %3d  G: %3d  T: %3d  N: %3d  -: %3d  L: %3d  |  \t%s' % fields)
Пример #11
0
def geno2genediv(args):
    """Write per-region haplotype diversity statistics to ``args.outfile``.

    For every region yielded by the line parser the haplotypes are encoded
    and the following values are computed:
      - He:  1 - sum of squared haplotype frequencies over all samples,
      - per-group He, computed the same way for each group,
      - dHe: He minus the group-size-weighted mean of the per-group He,
      - FST: dHe / He.
    One tab-separated row is written per region, holding position, region
    name, number of SNPs and haplotypes, FST, dHe, He, mean/median/max/min of
    the per-group values, and the per-group values themselves.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    # the total number of grouped samples is the same for every region
    total_samples = sum(len(x) for x in groups.values())

    # the context manager closes the file even if a region fails to process
    with open(args.outfile, 'wt') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tFST\tdHe\tHe\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n' %
                      '\t'.join(group_keys))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set(region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            haploarray = allel.HaplotypeArray([enc_haplos])

            cerr('I: calculating %d - %s' % (idx, region.name))

            # calculate total He first
            He = 1 - np.sum(haploarray.count_alleles().to_frequencies()**2)

            # calculate He per population, weighting each by its group size
            values = []
            pHe = 0
            for g in group_keys:
                he_p = 1 - np.sum(
                    haploarray.count_alleles(subpop=groups[g]).to_frequencies()**2)
                pHe += he_p * len(groups[g])
                values.append(he_p)

            dHe = He - pHe / total_samples
            FST = dHe / He

            params = (FST, dHe, He, np.mean(values), np.median(values),
                      np.max(values), np.min(values))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' % (
                region.P[0][0], region.P[0][1], region.name, len(region.P),
                len(haplotypes),
                '\t'.join('%5.4f' % x for x in params),
                '\t'.join('%5.4f' % x for x in values)))
Пример #12
0
    def _filter_HetThreshold(self, snp_info, data_items):
        """Check the proportion of non-homozygous samples at one SNP position.

        The check is only active when ``'HetThreshold'`` is in
        ``self.filters``.  Every genotype string (first element of each data
        item) other than '0/0', '1/1', '2/2', '3/3', '0', '1', '2', '3' is
        counted, which also includes missing calls such as './.'.

        Returns False (and logs the SNP id, ``snp_info[2]``) when the counted
        proportion is greater than or equal to the threshold, True otherwise.
        An empty *data_items* passes the filter.
        """

        if 'HetThreshold' not in self.filters:
            return True

        # no samples: nothing to count, and dividing by zero must be avoided
        if len(data_items) == 0:
            return True

        homozygous = ('0/0', '1/1', '2/2', '3/3', '0', '1', '2', '3')
        hets = sum(1 for (idx, data_item) in data_items
                   if data_item[0] not in homozygous)

        if hets / len(data_items) >= self.filters['HetThreshold']:
            cout('SNP ID: %s did not pass heterozygosity threshold.' %
                 snp_info[2])
            return False
        return True
Пример #13
0
def geno2pwfst(args):
    """Compute pair-wise FST between populations and write it as a matrix.

    The whole genotype file is read into a GenotypeArray, alleles are
    counted per group, and a block-wise Hudson FST (block length 10) is
    computed for every pair of groups.  ``args.outfile`` receives a header
    line with the sorted group names followed by the symmetric,
    tab-delimited FST matrix (zero diagonal).
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for k in groups:
        cout(' %12s %3d' % (k, len(groups[k])))

    group_keys = sorted(groups.keys())

    # read whole genotype, and release all unused memory
    cerr('I: reading genotype file')
    allel_array = lineparser.parse_all()
    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(allel_array)
    del allel_array

    cerr('I: counting alleles')
    ac = {g: genoarray.count_alleles(subpop=groups[g]) for g in group_keys}

    cerr('I: calculating FST')
    M = np.zeros((len(group_keys), len(group_keys)))
    # each unordered pair is computed once and mirrored into both halves of
    # the matrix; iterating over permutations computed every pair twice
    for (i, j) in itertools.combinations(range(len(group_keys)), 2):
        fst, _, _, _ = allel.stats.blockwise_hudson_fst(ac[group_keys[i]],
                                                        ac[group_keys[j]],
                                                        blen=10)
        M[i, j] = M[j, i] = fst

    with open(args.outfile, 'wt') as outfile:
        # write header:
        outfile.write('%s\n' % ('\t'.join(group_keys)))
        np.savetxt(outfile, M, delimiter='\t')
Пример #14
0
def seq2pi(args):
    """Compute the pi statistic (average and standard deviation) per group.

    Sequences are read from ``args.infile``.  When ``args.groupfile`` or
    ``args.metafile`` is given, sequences are grouped by the group parser's
    ``group_info`` (keyed by the ASCII-decoded label); sequences without a
    group are reported and skipped.  Otherwise all sequences form one group
    named 'ALL'.  Results are printed and, when ``args.outfile`` is given,
    also written there as a tab-separated table.
    """

    # open and read sequence file
    cerr('[I - reading sequence file %s]' % args.infile)
    seqs = load(args.infile)

    # open and read group/meta file using groupfile/metafile if available
    if args.groupfile or args.metafile:
        cerr('[I - reading group information file]')
        group_parser = grpparser.GroupParser(args)
        group_parser.parse()

        group_seqs = {}

        for seq in seqs:
            label = seq.label.decode('ASCII')
            try:
                grp = group_parser.group_info[label]
            except KeyError:
                cerr('[W - sample %s is not assign to any group]' % label)
                continue
            if grp not in group_seqs:
                group_seqs[grp] = multisequence()
            group_seqs[grp].append(seq)
    else:
        group_seqs = {'ALL': seqs}

    print('Groups:')
    outf = open(args.outfile, 'w') if args.outfile else None
    try:
        if outf:
            outf.write('GROUP\tN\tPI\tSTDDEV\n')
        for g in group_seqs:
            avg, stddev = calc_pi(group_seqs[g])
            cout('  %20s [%3d]: %f +- %f' %
                 (g, len(group_seqs[g]), avg, stddev))
            if outf:
                outf.write('%s\t%d\t%5.4f\t%5.4f\n' %
                           (g, len(group_seqs[g]), avg, stddev))
    finally:
        # make sure the table is flushed and the handle released, even when
        # a group fails to compute
        if outf:
            outf.close()

    if args.outfile:
        # closing bracket added: the original message was left unterminated
        cerr('[I - result written to %s]' % args.outfile)
Пример #15
0
def geno2genediv(args):
    """Write per-region, per-group Hudson statistics to ``args.outfile``.

    For every region yielded by the line parser the haplotypes are encoded
    and, for each group, allele counts of the group and of all remaining
    samples are passed to ``allel.stats.hudson_fst``.  One tab-separated row
    is written per region with position, region name, number of SNPs and
    haplotypes, mean/median/max/min of the collected per-group values, and
    the per-group values themselves.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    # set group
    groups = lineparser.parse_grouping()

    cout('Grouping:')
    group_keys = sorted(groups.keys())
    for k in group_keys:
        cout(' %12s %3d' % (k, len(groups[k])))

    # the context manager closes the file even if a region fails to process
    with open(args.outfile, 'wt') as outfile:
        outfile.write(
            'CHROM\tPOS\tREGION\tN_SNP\tN_HAPLO\tMEAN\tMEDIAN\tMAX\tMIN\t%s\n' %
            '\t'.join(group_keys))

        for idx, region in enumerate(lineparser.parse_genes()):
            haplotypes = set(region.haplotypes())
            enc_haplos = region.encode_haplotypes()
            # the encoding is expected to number the haplotypes 0..n-1
            assert len(haplotypes) == max(enc_haplos) + 1
            haploarray = allel.HaplotypeArray([enc_haplos])

            cerr('I: calculating %d - %s' % (idx, region.name))

            value = []
            for g in group_keys:
                ac_g = haploarray.count_alleles(subpop=groups[g])
                ac_ng = haploarray.count_alleles(
                    subpop=list(lineparser.sample_idx - set(groups[g])))
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                # NOTE(review): only the denominator is recorded, not
                # num / den - confirm that this is intended
                value.append(den)

            params = (np.mean(value), np.median(value), np.max(value),
                      np.min(value))
            outfile.write('%s\t%s\t%s\t%d\t%d\t%s\t%s\n' %
                          (region.P[0][0], region.P[0][1], region.name,
                           len(region.P), len(haplotypes),
                           '\t'.join('%5.4f' % x for x in params),
                           '\t'.join('%5.4f' % x for x in value)))
Пример #16
0
    def _filter_MissingThreshold(self, snp_info, data_items):
        """Check the proportion of samples with a missing call at one SNP.

        The check is only active when ``'MissingThreshold'`` is in
        ``self.filters``.  A sample counts as missing when its genotype
        string (first element of the data item) is './.' or '.'.

        Returns False (and logs the SNP id, ``snp_info[2]``) when the missing
        proportion is greater than or equal to the threshold, True otherwise.
        An empty *data_items* passes the filter.
        """

        if 'MissingThreshold' not in self.filters:
            return True

        # no samples: nothing to count, and dividing by zero must be avoided
        if len(data_items) == 0:
            return True

        missing = sum(1 for (idx, data_item) in data_items
                      if data_item[0] in ('./.', '.'))

        if missing / len(data_items) >= self.filters['MissingThreshold']:
            cout('SNP ID: %s did not pass missing threshold.' %
                 snp_info[2])
            return False

        return True
Пример #17
0
def geno2fst(args):
    """Write per-position Hudson FST of each group against all other samples.

    For every position yielded by the line parser the minor allele frequency
    (smallest allele count divided by the total count) and, per group, the
    Hudson FST between that group and the remaining samples are computed.
    FST values outside [0, 1], including NaN, are replaced by 0.  One
    tab-separated row is written per position to ``args.outfile`` with the
    max/mean/median FST, the MAF and the per-group FST values.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for k in groups:
        cout(' %12s %3d' % (k, len(groups[k])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    # output to file
    cout('Writing outfile...')
    # the context manager closes the file even if a position fails to process
    with open(args.outfile, 'w') as outfile:

        outfile.write('CHROM\tPOS\tREGION\tMAX\tMEAN\tMEDIAN\tMAF\t%s\n' %
                      '\t'.join(group_keys))

        for idx, (posinfo, genolist) in enumerate(lineparser.parse(), 1):

            genoarray = allel.GenotypeArray([genolist])

            # calculate MAF; when the smallest count equals the total there
            # is no minor allele, so the frequency is reported as 0
            ac = genoarray.count_alleles()
            minor_count = np.min(ac)
            total_count = np.sum(ac)
            if minor_count == total_count:
                maf = 0
            else:
                maf = minor_count / total_count

            # calculate FST per group against other samples
            fst_sites = []
            for g in group_keys:
                ac_g = genoarray.count_alleles(subpop=groups[g])
                ac_ng = genoarray.count_alleles(
                    subpop=list(lineparser.sample_idx - set(groups[g])))
                num, den = allel.stats.hudson_fst(ac_g, ac_ng)
                fst = num[0] / den[0]
                # written as a negated range test so that NaN is reset too
                if not (0.0 <= fst <= 1.0):
                    fst = 0
                fst_sites.append(fst)

            if idx % 100 == 0:
                cerr('I: writing position no %d' % idx)

            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                          (posinfo[0], posinfo[1], posinfo[4],
                           np.max(fst_sites), np.mean(fst_sites),
                           np.median(fst_sites), maf,
                           '\t'.join('%5.4f' % x for x in fst_sites)))
Пример #18
0
def calc_fst(mseqs):
    """Compute Hudson FST for every pair of groups in *mseqs*.

    Returns ``(FST_mat, groups)``.  For a pair ``i < j`` the cell
    ``FST_mat[i, j]`` holds the NaN-ignoring mean of the per-site ratio
    num / den, and the mirrored cell ``FST_mat[j, i]`` holds its NaN-ignoring
    standard deviation.  The diagonal stays zero.
    """

    groups = list(mseqs.keys())
    n_groups = len(groups)
    FST_mat = np.zeros((n_groups, n_groups))
    allele_counts = count_allele(mseqs)

    for i, j in itertools.combinations(range(n_groups), 2):

        first_ac = allele_counts[groups[i]]
        second_ac = allele_counts[groups[j]]

        # silence floating point warnings from zero denominators
        with np.errstate(divide='ignore', invalid='ignore'):
            num, den = allel.hudson_fst(first_ac, second_ac)

            FST_mat[i, j] = np.nanmean(num / den)
            FST_mat[j, i] = np.nanstd(num / den)

        summary = (FST_mat[i, j], FST_mat[j, i], groups[i], groups[j])
        cout('%5.4f +- %5.4f : %s <> %s' % summary)

    return FST_mat, groups
Пример #19
0
def groupinfo(args):
    """Print the number of samples assigned to each group.

    The sample names come from ``args.infile``: via the NAlt line parser for
    the 'pickle' and 'npy' formats, from all whitespace-separated tokens of
    the file for the 'list' format, and from the tokens of the first line
    for any other format.
    """

    if args.fmt in ('pickle', 'npy'):

        from seqpy.core.bioio import naltparser
        from types import SimpleNamespace
        parser_args = SimpleNamespace(infile=args.infile, fmt=args.fmt, n=-1)
        samples = naltparser.NAltLineParser(parser_args,
                                            with_group=False,
                                            with_position=False).samples

    elif args.fmt == 'list':
        with gzopen(args.infile) as f:
            samples = f.read().split()

    else:
        # only the first line of infile is needed
        with gzopen(args.infile) as f:
            samples = f.readline().strip().split()

    groups = grpparser.GroupParser(args).assign_groups(samples)

    cout('Groups:')
    sizes = []
    for name in sorted(groups.keys()):
        size = len(groups[name])
        cout('  %3d - %s' % (size, name))
        sizes.append(size)
    cout('Total: %d samples' % sum(sizes))
Пример #20
0
def main():
    """Run the GUI: create the application and main window, enter the loop.

    ``cout`` is redirected to ``writelog`` first.  When a file name is given
    as first command line argument it is loaded 100 ms after start-up;
    otherwise the main window just receives focus.
    """

    app = QtWidgets.QApplication(sys.argv)

    # patching seqpy.cout
    set_cout(writelog)
    cout('console log ready..')

    infile = sys.argv[1] if len(sys.argv) > 1 else None

    w = IMainWindow()
    w.show()

    if not infile:
        w.setFocus()
    else:
        # delay loading so that all windows can be drawn first
        QtCore.QTimer.singleShot(100, lambda: w.load(infile))

    app.exec_()
Пример #21
0
def main(args):
    """Merge sequence files, append attributes, then summarise and/or save.

    All files in ``args.files`` are loaded and concatenated in order and the
    source attributes are appended.  With ``args.summary`` the counts of
    A, C, G, T and '-' in each upper-cased sequence are printed; with
    ``args.outfile`` the collection is saved.
    """

    merged = None
    for path in args.files:
        loaded = bioio.load(path, options=args.io_opts or [])
        cout('reading %d sequences from %s' % (len(loaded), path))
        if merged is None:
            merged = loaded
        else:
            merged += loaded

    append_attributes(merged, args.src, args.src_isolate, args.definition)

    if args.summary:
        for seq in merged:
            bases = seq.seq.upper()
            label = seq.label.decode('ASCII')
            counts = tuple(
                bases.count(symbol)
                for symbol in (b'A', b'C', b'G', b'T', b'-'))
            print(">%s\nA:%d\tC:%d\tG:%d\tT:%d\t-:%d" % ((label,) + counts))

    if args.outfile:
        bioio.save(merged, args.outfile, options=args.io_opts or [])
Пример #22
0
def geno2pairfst(args):
    """Print the ten positions with the highest Hudson FST between two sets.

    ``args.grp1`` and ``args.grp2`` are comma-separated group names; the
    samples of the named groups are pooled into two sets.  For every
    position the Hudson FST between the two sets is computed, values outside
    [0, 1] (including NaN) are replaced by 0, and the ten largest are
    printed with their position information.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.diploid_translator)

    cout('Grouping:')
    groups = lineparser.parse_grouping()
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    cout(sorted(groups.keys()))

    def pooled(spec):
        # samples of all groups named in the comma-separated spec
        return list(itertools.chain.from_iterable(
            groups[name] for name in spec.split(',')))

    grp1 = pooled(args.grp1)
    grp2 = pooled(args.grp2)

    ranked = []
    for (posinfo, genolist) in lineparser.parse():

        genoarray = allel.GenotypeArray([genolist])

        ac_g1 = genoarray.count_alleles(subpop=grp1)
        ac_g2 = genoarray.count_alleles(subpop=grp2)
        num, den = allel.stats.hudson_fst(ac_g1, ac_g2)
        fst = num[0] / den[0]
        # negated range test on purpose: it also catches NaN
        if not (0.0 <= fst <= 1.0):
            fst = 0
        ranked.append((fst, posinfo))

    ranked.sort(reverse=True)
    for fst, posinfo in ranked[:10]:
        cout('%s\t%s\t%s\t%5.4f' % (posinfo[0], posinfo[1], posinfo[4], fst))
Пример #23
0
    def file_open(self, filename=None):
        """Load *filename* and show its content in a viewer.

        Without *filename* a file dialog asks for one; cancelling the dialog
        aborts.  An alert is shown when the file does not exist or when
        loading yields nothing usable.
        """

        if not filename:
            filename, _selected_filter = QtWidgets.QFileDialog.getOpenFileName(
                self.pane(), "Open project, alignment or trace file")
            if not filename:
                return

        cout("Loading file %s" % filename)
        if not os.path.exists(filename):
            alert('File %s does not exists. Please check your filename!' % filename)
            return

        # show a progress widget while the file is being read
        b = progress('Opening ' + filename)
        b.repaint()
        obj = bioio.load(filename)
        b.hide()
        del b

        if not obj:
            alert('Error reading file ' + filename +'\nUnknown file format!')
            return

        self.view(obj)
Пример #24
0
def geno2dhe(args):
    """Write per-position He, dHe and FST to ``args.outfile``.

    The whole genotype file is read into a GenotypeArray.  Per position:
      - He is 1 - sum of squared allele frequencies over all samples,
      - the per-group He is computed the same way for each group,
      - dHe is He minus the group-size-weighted mean of the per-group He,
      - FST is dHe / He.
    One tab-separated row per position holds position information, FST,
    dHe, He and the per-group He in sorted group order.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    group_keys = sorted(groups.keys())
    cout(group_keys)

    # read whole genotype, and release all unused memory
    cerr('I: reading genotype file')
    allel_array = lineparser.parse_all()
    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(allel_array)
    del allel_array

    def diversity(allele_counts):
        # 1 - sum of squared allele frequencies, per position
        return 1 - np.sum(allele_counts.to_frequencies()**2, axis=1)

    cerr('I: calculating He')
    He = diversity(genoarray.count_alleles())

    He_groups = {}
    pHe = None
    for g in groups:
        He_groups[g] = diversity(genoarray.count_alleles(subpop=groups[g]))
        weighted = He_groups[g] * len(groups[g])
        pHe = weighted if pHe is None else pHe + weighted

    dHe = He - pHe / sum(len(members) for members in groups.values())
    FST = dHe / He

    cerr('I: writing output file')
    with open(args.outfile, 'wt') as outfile:
        outfile.write('CHROM\tPOS\tREGION\tFST\tdHe\tHe\t%s\n' %
                      '\t'.join(group_keys))

        for i, he_total in enumerate(He):
            posinfo = lineparser.position[i]
            per_group = '\t'.join('%5.4f' % He_groups[g][i]
                                  for g in group_keys)

            outfile.write('%s\t%s\t%s\t%5.4f\t%5.4f\t%5.4f\t%s\n' %
                          (posinfo[0], posinfo[1], posinfo[4], FST[i],
                           dHe[i], he_total, per_group))
Пример #25
0
def vcf2seq(args):
    """Convert a VCF file to a multisequence and save it to ``args.outfile``.

    The helper is set up with a fixed list of filters to which
    ``args.opts`` is appended.  After parsing, the per-chromosome counters
    collected by the helper are printed as a report.
    """

    helper = VCF2SeqHelper(
        args.vcffile, args.chr,
        'NoIndel,LowQual,MissingThreshold=0.05,HetThreshold=0.25,' + args.opts)
    helper.parse()
    sequences = helper.get_multisequence()

    cout('Report:')
    for entry in helper.chr_used.items():
        # entry is a (chromosome, count) pair
        cout(' %s\t%d' % entry)

    cout('Writing to %s' % args.outfile)
    bioio.save(sequences, args.outfile)
Пример #26
0
def geno2dxy(args):
    """Write a square pairwise dxy matrix to ``args.outfile``.

    The whole genotype file is read into a GenotypeArray and converted to
    allele counts; ``allel.pairwise_dxy`` is applied to it and the condensed
    result is expanded to a square matrix.  The output is the sample header
    followed by the tab-delimited matrix.
    """

    lineparser = tabparser.GenotypeLineParser(args)
    lineparser.set_translator(lineparser.haploid_translator)
    lineparser.parse_grouping()

    cout('Grouping:')
    groups = lineparser.groups
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    cout(sorted(groups.keys()))

    # read whole genotype, and release all unused memory
    cerr('I: reading genotype file')
    raw_genotypes = lineparser.parse_all()
    cerr('I: generating genotype array')
    genoarray = allel.GenotypeArray(raw_genotypes)
    del raw_genotypes

    cerr('I: generating allele count array')
    gac = genoarray.to_allele_counts()

    cerr('I: calculating pairwise dxy')
    # running indices are used as positions
    condensed = allel.pairwise_dxy(range(len(gac)), gac)
    distm = scipy.spatial.distance.squareform(condensed)

    cerr('I: writing to outfile')
    # binary mode: the sample header is written as bytes
    with open(args.outfile, 'wb') as outfile:
        outfile.write(lineparser.get_sample_header(True))
        outfile.write(b'\n')

        # write the matrix
        np.savetxt(outfile, distm, delimiter='\t')
Пример #27
0
def geno2hierfst(args):
    """Select high-FST positions for each partition of a hierarchy file.

    Every non-empty, non-comment line of ``args.hierfile`` has two
    tab-separated fields, each a comma-separated list of group names; the
    samples of the named groups form the two sides of a partition.  For each
    partition the per-position Hudson FST is computed (values outside
    [0, 1], including NaN, become 0) and positions are taken in descending
    FST order until the FST drops below ``args.minfst`` or the cumulative
    FST exceeds ``args.cumfst``.  All selected positions are printed.
    """

    genoparser = tabparser.GenotypeLineParser(args)
    genoparser.set_translator(genoparser.diploid_translator)

    cerr('Grouping:')
    groups = genoparser.parse_grouping()
    for name in groups:
        cout(' %12s %3d' % (name, len(groups[name])))

    def pooled(spec):
        # samples of all groups named in the comma-separated spec
        return list(
            itertools.chain.from_iterable(
                groups[name] for name in spec.split(',')))

    hierarchy = []
    with open(args.hierfile) as hierfile:
        for raw in hierfile:
            line = raw.strip()
            if not line or line.startswith('#'):
                continue
            partitions = line.split('\t')
            print(partitions)
            left = pooled(partitions[0])
            right = pooled(partitions[1])
            hierarchy.append((left, right))
    cerr('[I: preparing %d hierarchy]' % len(hierarchy))

    cerr('[I: reading genotype file...]')
    genotypes = genoparser.parse_all()
    genoarray = allel.GenotypeArray(genotypes)
    del genotypes

    selected_positions = []
    for level, (grp1, grp2) in enumerate(hierarchy, 1):
        cerr('[I: processing hierarchy #%d]' % level)
        ac_g1 = genoarray.count_alleles(subpop=grp1)
        ac_g2 = genoarray.count_alleles(subpop=grp2)
        num, den = allel.stats.hudson_fst(ac_g1, ac_g2)
        fst = num / den

        # out-of-range values (NaN included) are treated as 0
        ranked = [(v if (0.0 <= v <= 1.0) else 0, p)
                  for p, v in zip(genoparser.position, fst)]
        ranked.sort(reverse=True)

        cumulative_fst = 0.0
        for (v, p) in ranked:
            if v < args.minfst or cumulative_fst > args.cumfst:
                break
            selected_positions.append((p, v))
            cumulative_fst += v

    for (p, v) in selected_positions:
        cout('%s\t%s\t%s\t%5.4f' % (p[0], p[1], p[4], v))
Пример #28
0
    def parse_samples(self, snp_info, data_items):
        """Filter one SNP and append its base to every sample sequence.

        Nothing is done when the parent's ``parse_samples`` returns a false
        value.  Depending on the entries of ``self.filters`` the SNP is
        dropped when
          - 'MissingThreshold': at least 5% of the genotypes are './.',
          - 'HetThreshold': at least 33% of the genotypes are not one of
            '0/0', '1/1', '2/2',
          - 'MAF': the folded fraction of '0/0' genotypes is below
            ``self.filters['MAF']``.
        Otherwise each sample receives ``ref[0]`` for '0/0', ``alt[0]`` for
        '1/1', ``alt[1]`` for '2/2' and the code of 'N' for anything else,
        and the counter of the chromosome is incremented.
        """

        if not super().parse_samples(snp_info, data_items):
            return

        (chrom, _pos, posid, ref, alt, _qual, _filters, _info,
         _format) = snp_info

        def tally(matches):
            # number of samples whose genotype string satisfies ``matches``
            return sum(1 for (_idx, item) in data_items if matches(item[0]))

        if 'MissingThreshold' in self.filters:
            missing = tally(lambda gt: gt == './.')
            if missing / len(data_items) >= 0.05:
                cout('SNP ID: %s did not pass missing threshold.' % posid)
                return

        if 'HetThreshold' in self.filters:
            hets = tally(lambda gt: gt not in ['0/0', '1/1', '2/2'])
            if hets / len(data_items) >= 0.33:
                cout('SNP ID: %s did not pass heterozygosity threshold.' %
                     posid)
                return

        if 'MAF' in self.filters:
            refs = tally(lambda gt: gt == '0/0')
            maf = refs / len(data_items)
            # fold the frequency so that it never exceeds 0.5
            if maf > 0.5:
                maf = 1 - maf
            if maf < self.filters['MAF']:
                print('SNP ID: %s did not pass MAF threshold.' % posid)
                return

        for (idx, item) in data_items:
            gt = item[0]
            if gt == '0/0':
                base = ref[0]
            elif gt == '1/1':
                base = alt[0]
            elif gt == '2/2':
                base = alt[1]
            else:
                base = ord('N')
            self.mseq[idx].append(base)

        # reporting purposes
        self.chr_used[chrom] += 1
Пример #29
0
def consolidate_predictions(args):
    """Report doubtful predictions and optionally pickle per-model reports.

    Pickled predictions are read from ``args.infile``; they are indexed by
    model and then by ``k``.  Each entry is turned into a data frame whose
    column with the largest value in a row is that row's predicted group.
    A row is printed when its largest value is below ``args.threshold`` or
    when the predicted group differs from the assigned group.  With
    ``args.outreport`` a score and a row-normalised confusion matrix are
    collected per "model|k" and pickled to that file.
    """

    if args.samplefile:
        samples = read_samplefile(args.samplefile, args.fmt)
    else:
        samples = None

    group_parser = grpparser.GroupParser(args)
    group_parser.assign_groups(samples)
    # group_parser.group_keys contains [ 'grp1', 'grp2', etc]
    group_keys = group_parser.group_keys

    # NOTE(security): unpickling can execute arbitrary code; infile must
    # come from a trusted source
    with open(args.infile, 'rb') as f:
        predictions = pickle.load(f)

    outreport = open(args.outreport, 'wb') if args.outreport else None
    try:
        if outreport:
            # only needed when a report is requested
            from sklearn.metrics import confusion_matrix
        reports = {}

        normalize = True

        for model in predictions:
            model_pred = predictions[model]

            for k in model_pred:

                cerr('Preparing for model: {} k: {}'.format(model, k))
                df = generate_dataframe(model_pred[k])

                group_indexes = np.argmax(df.values, axis=1)
                group_predictions = df.columns[group_indexes[:, None]]
                for i, best in enumerate(group_indexes):
                    predicted_group = df.columns[best]
                    prediction_confidence = df.values[i, best]
                    if (prediction_confidence < args.threshold
                            or predicted_group != group_keys[i]):
                        # NOTE(review): samples is None when no samplefile
                        # is given, in which case this lookup fails
                        cout('{}: {} -> {} ({})'.format(
                            samples[i], group_keys[i], predicted_group,
                            prediction_confidence))

                if outreport:

                    score = lkmodels.calculate_scores(group_keys,
                                                      group_predictions)
                    confmat = confusion_matrix(group_keys, group_predictions)

                    if normalize:
                        # divide every row by its row sum
                        confmat = confmat.astype('float') / confmat.sum(
                            axis=1)[:, np.newaxis]
                        cerr("[I - Normalized confusion matrix]")
                    else:
                        cerr('[I - Confusion matrix, without normalization]')

                    reports['{}|{}'.format(model, k)] = {
                        'score': score,
                        'confmat': confmat
                    }

        if outreport:
            pickle.dump(reports, outreport)
            cerr('[I - writing pickled report to {}]'.format(args.outreport))
    finally:
        # the report handle used to stay open; close it on every exit path
        if outreport:
            outreport.close()
Пример #30
0
def main(args):
    """Condense the sequences of ``args.infile`` and save them to ``args.outfile``."""

    sequences = bioio.load(args.infile, options=args.io_opts)
    cout('reading %d sequences from %s' % (len(sequences), args.infile))

    condensed = funcs.condensed(sequences)
    bioio.save(condensed, args.outfile)