def _run(self):
    """Write per-year report counts to two csv files.

    Produces `first-reports.csv` (year of first report per mutation,
    excluding exac and 2016) and `epdata-reports.csv` (epdata entries
    per publication year, excluding 2016).
    """
    with base.connect() as con:
        cursor = con.cursor()

        # Per year: number of mutations whose earliest report was that year.
        query = (
            'select year, count(old) as n from ('
            'SELECT old, idx, new, pub, min(year) as year'
            ' FROM report inner join publication'
            ' on report.pub == publication.key'
            ' where pub != "exac" and year != 2016'
            ' group by idx, new)'
            ' group by year'
        )
        path = self.data_out('first-reports.csv')
        print('Writing data to ' + path)
        with open(path, 'w') as handle:
            writer = self.csv_writer(handle)
            writer.writerow(['year', 'first_reports'])
            for record in cursor.execute(query):
                writer.writerow([record['year'], record['n']])

        # Per year: number of epdata entries published that year.
        query = (
            'select year, count(old) as n from ('
            'SELECT old, idx, new, pub, year'
            ' FROM epdata inner join publication'
            ' on epdata.pub == publication.key'
            ' where year != 2016)'
            ' group by year'
        )
        path = self.data_out('epdata-reports.csv')
        print('Writing data to ' + path)
        with open(path, 'w') as handle:
            writer = self.csv_writer(handle)
            writer.writerow(['year', 'epdata_reports'])
            for record in cursor.execute(query):
                writer.writerow([record['year'], record['n']])
    def expected(self, simple_weights=False):
        """
        Return a 2d matrix of expected amino acid substitution ratios.

        Weights are based on single nucleotide substitutions. By default
        they include rates derived from the human genome; set
        simple_weights=True to disable this and use the simple weights.
        """
        acids, acid_map = self.acids()
        size = len(acids)

        matrix = np.zeros((size, size))
        with base.connect() as con:
            cursor = con.cursor()
            column = 'simple_weight as weight' if simple_weights else 'weight'
            query = 'select old, new, ' + column + ' from mutation_possible'
            for record in cursor.execute(query):
                i = acid_map[record['old']]
                j = acid_map[record['new']]
                matrix[i, j] += record['weight']
                # A self-substitution would be suspect data; flag it.
                if i == j:
                    print(record['old'], record['new'])

        # Normalise so that all entries sum to one.
        return matrix / np.sum(matrix)
    def _run(self):
        """
        Verify that every mutation-publication link in `epdata` also
        appears in the `report` table.

        Prints the missing links and exits with status 1 on failure;
        prints an ok message otherwise.
        """
        # Import hoisted out of the failure branch (was buried mid-loop).
        import sys

        with base.connect() as con:
            c = con.cursor()

            # Load all epdata mentions
            epdata_mentions = set()
            q = 'select old, idx, new, pub from epdata'
            for old, idx, new, pub in c.execute(q):
                epdata_mentions.add((old, idx, new, pub))

            # Load all reports
            reports = set()
            q = 'select old, idx, new, pub from report'
            for old, idx, new, pub in c.execute(q):
                reports.add((old, idx, new, pub))

            # Any epdata link missing from the report table is an error.
            diff = epdata_mentions - reports
            if diff:
                print('-' * 60)
                print('EPData establishes mutation-publication links not'
                      ' mentioned in report table')
                print('-' * 60)
                # Sorted, so the failure output is deterministic.
                for old, idx, new, pub in sorted(diff):
                    print(old + str(idx) + new + ' : ' + pub)
                sys.exit(1)
            print('[ok] EPData links all listed in reports')
示例#4
0
def main():
    """Create the drones and deploy to all configured target hosts."""
    # Create drones
    drones.main()

    ######################################################
    ## DOMAINS CONFIGURATION                            ##
    ######################################################
    for number in xrange(0, 18):
        hosts.add_host('target%i' % number, 'deploy')
#    for i in xrange(0,6):
#        hosts.add_host('srv%i' % i, 'deploy')
#    hosts.add_host('storage0', 'deploy')
#    hosts.add_host('storage1', 'deploy')
#    hosts.add_host('monitor0', 'deploy')
#    hosts.add_host('monitor1', 'deploy')
#    hosts.add_host('load0', 'deploy')
#    hosts.add_host('load1', 'deploy')
#    hosts.add_host('test0', 'deploy')
#    hosts.add_host('test1', 'deploy')
    ######################################################

    # Open a connection to every drone relay
    host_list = hosts.get_hosts_list()
    deferreds = base.connect(host_list)

    # Once every connection is up, run the deploy phase
    wait = defer.DeferredList(deferreds)
    wait.addCallback(deploy_phase)

    # Hand control to the Twisted reactor
    reactor.run()
 def _run(self):
     """Print every publication reporting at least five mutations."""
     with base.connect() as con:
         cursor = con.cursor()
         query = ('select pub, count(idx) as muts from report group by pub'
                  ' order by muts desc')
         for record in cursor.execute(query):
             # Rows are sorted descending, so stop at the first small one.
             if record['muts'] < 5:
                 break
             print(str(record['muts']) + ' :: ' + record['pub'])
示例#6
0
 def gather(self, q, fname1, fname2):
     """
     Query the database with ``q``, print summary statistics of the
     activation (a) and inactivation (i) standard deviations, and write
     the (na, stda) and (ni, stdi) pairs to csv files ``fname1`` and
     ``fname2``.
     """
     # Query db
     na, ni = [], []
     stda, stdi = [], []
     sema, semi = [], []
     with base.connect() as con:
         c = con.cursor()
         for row in c.execute(q):
             if row['na'] > 0:
                 na.append(row['na'])
                 stda.append(row['stda'])
                 sema.append(row['sema'])
             if row['ni'] > 0:
                 ni.append(row['ni'])
                 stdi.append(row['stdi'])
                 semi.append(row['semi'])
     # Summary statistics; "2sr" is the 2-sigma range (4 standard
     # deviations, i.e. +/- 2 sigma).
     stda = np.array(stda)
     stdi = np.array(stdi)
     print('std a mean: ' + str(np.mean(stda)))
     print('std i mean: ' + str(np.mean(stdi)))
     print('2sr a mean: ' + str(4*np.mean(stda)))
     print('2sr i mean: ' + str(4*np.mean(stdi)))
     print('std a min: ' + str(np.min(stda)))
     print('std a max: ' + str(np.max(stda)))
     print('std i min: ' + str(np.min(stdi)))
     print('std i max: ' + str(np.max(stdi)))
     # Write files. (A dead `if False:` debug-plot block and two unused
     # `data = []` locals were removed here.)
     fname1 = self.data_out(fname1)
     print('Writing to ' + fname1)
     with open(fname1, 'w') as f:
         csv = self.csv_writer(f)
         csv.writerow(['na', 'stda'])
         for k, n in enumerate(na):
             csv.writerow((n, stda[k]))
     fname2 = self.data_out(fname2)
     print('Writing to ' + fname2)
     with open(fname2, 'w') as f:
         csv = self.csv_writer(f)
         csv.writerow(['ni', 'stdi'])
         for k, n in enumerate(ni):
             csv.writerow((n, stdi[k]))
     print('Done')
示例#7
0
 def _run(self):
     """
     Compute sliding-window averages of mutation count, human-squid-eel
     score and domain-alignment score per position, and write them to a
     csv file plus an explanatory txt file.
     """
     with base.connect() as con:
         c = con.cursor()
         # Get positions (may have gaps!)
         idx = []
         for r in c.execute('select idx from scn5a order by idx'):
             idx.append(r[0])
         idx = np.array(idx)
         # Get mutations for each position (0/1 flag per position)
         q = 'select distinct idx, new from report where pub != "exac"'
         mut = np.zeros(idx.shape)
         for r in c.execute(q):
             mut[r[0] - 1] = 1  # Positions start at 1
         # Get Human-squid-eel and domain alignment score
         hse = np.zeros(idx.shape, dtype=float)
         dom = np.zeros(idx.shape, dtype=float)
         q = 'select idx, hse, dom from conservedness order by idx'
         for k, r in enumerate(c.execute(q)):
             assert(r[0] == 1 + k) # Score should be stored for each idx
             hse[k] = r[1]
             dom[k] = r[2]
         #
         # 1. Sliding window averages for mutation count, hse score and dom
         #    score.
         #
         radius = 5
         ms = window(mut, radius)
         hs = window(hse, radius)
         ds = window(dom, radius)
         basename = 'windowed-averages'
         filename = self.data_out(basename + '.txt')
         print('Writing info to ' + filename)
         with open(filename, 'w') as f:
             f.write('Mutation count, human-squid-eel and domain-alignment'
                 ' score were measured per position, and then averaged'
                 ' using a sliding window with radius ' + str(radius)
                 + ' leading to a window size of ' + str(1 + 2 * radius)
                 + '. At the borders, where the window is smaller, the'
                 ' average is computed by dividing through the effective'
                 ' window size at that point.')
         filename = self.data_out(basename + '.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             # Fix: the csv writer used to be bound to `c`, shadowing the
             # database cursor above; use a distinct name instead.
             w = self.csv_writer(f)
             w.writerow([
                 'position',
                 'mutation_count',
                 'hse_score',
                 'dom_score',
                 ])
             hs = iter(hs)
             ds = iter(ds)
             for k, m in enumerate(ms):
                 w.writerow([k+1, m, next(hs), next(ds)])
示例#8
0
def connect_next():
    global index
    
    if index >= len(hosts):
        print 'Successful'
        reactor.stop()
        return
    
    host = hosts[index]
    dlist = base.connect((host,))
    dlist[0].addCallback(connected, host)
    dlist[0].addErrback(errback, host)
    def _run(self):
        """
        Creates the weka (arff) files.

        Writes two discretized voltage-shift files (activation and
        inactivation) and one class file per output variable. The
        copy-pasted call sequence was replaced by data-driven loops.
        """
        with base.connect() as connection:
            # Discretized files: dva=True for activation, False for
            # inactivation.
            for dva, fname in ((True, 'dva-disc.arff'),
                               (False, 'dvi-disc.arff')):
                self.create_discretized(
                    connection,
                    self.data_out(fname),
                    dva=dva,
                    moments=False,
                    deltas_only=True,
                )

            # One class file per output variable.
            for output in ('zero', 'act', 'inact', 'late', 'changed'):
                self.create_class(connection,
                                  self.data_out(output + '.arff'),
                                  output=output,
                                  deltas_only=True)
    def observed(self):
        """
        Returns a matrix of observed amino acid substitution ratios.

        Each unique (old, idx, new) mutation in the report table
        (excluding exac) is counted once; the matrix is then normalised
        so that all entries sum to one.
        """
        acids, acid_map = self.acids()
        n = len(acids)

        matrix = np.zeros((n, n), dtype=float)
        with base.connect() as con:
            c = con.cursor()
            q = 'select distinct old, idx, new from report where pub != "exac"'
            # The enumerate index was unused; iterate the cursor directly.
            for row in c.execute(q):
                old = acid_map[row['old']]
                new = acid_map[row['new']]
                matrix[old, new] += 1

        return matrix / np.sum(matrix)
示例#11
0
def main():
    """Create the drones and deploy to the two load hosts."""
    # Create drones
    drones.main()

    # Register the load hosts
    hosts.add_host('load0', 'deploy')
    hosts.add_host('load1', 'deploy')

    # Open a connection to every drone relay
    host_list = hosts.get_hosts_list()
    deferreds = base.connect(host_list)

    # Run the deploy phase once every connection is up
    wait = defer.DeferredList(deferreds)
    wait.addCallback(deploy_phase)

    # Hand control to the Twisted reactor
    reactor.run()
示例#12
0
def main():
    """Create the drones and run test0 against a single action host."""
    # Create drones
    drones.main()

    # Register the single action host
    hosts.add_host('192.168.96.6', 'action')

    host_list = hosts.get_hosts_list()

    # Open a connection to every drone relay
    deferreds = base.connect(host_list)
    wait = defer.DeferredList(deferreds)

    # Decide what to do after connection setup
    wait.addCallback(test0)

    # Hand control to the Twisted reactor
    reactor.run()
示例#13
0
 def gather(self, q):
     """
     Run query ``q`` and print summary statistics of the midpoints.

     For every row, tmid is the midpoint between tmin and tmax.
     """
     # Query db
     tmin, tmax, tmid = [], [], []
     with base.connect() as con:
         c = con.cursor()
         for row in c.execute(q):
             tmin.append(row['tmin'])
             tmax.append(row['tmax'])
             tmid.append(0.5 * (row['tmin'] + row['tmax']))
     # Create list of tmin, tmax, tmid
     tmin = np.array(tmin)
     tmax = np.array(tmax)
     tmid = np.array(tmid)
     # Fix: this line was labelled "TMid" but computed np.mean(tmin).
     print('TMid, mean: ' + str(np.mean(tmid)))
     print('TMid, std : ' + str(np.std(tmid)))
     print('2Sigma range: ' + str(4 * np.std(tmid)))
     # Overall extremes: smallest tmin and largest tmax.
     print('Min, max: ' + str(np.min(tmin)) + ', ' + str(np.max(tmax)))
     # Write file
     print('Done')
示例#14
0
def main():
    """Create the drones and run the deploy phase on all target hosts."""
    # Create drones
    drones.main()

    # Register the 18 target hosts
    for number in xrange(0, 18):
        hosts.add_host('target%i' % number, 'action')

    # Open a connection to every drone relay
    host_list = hosts.get_hosts_list()
    deferreds = base.connect(host_list)

    # Deploy once all connections are up; report errors via errback
    wait = defer.DeferredList(deferreds)
    wait.addCallback(deploy_phase)
    wait.addErrback(error)

    # Hand control to the Twisted reactor
    reactor.run()
 def _run(self):
     """Write diagram (x, y) coordinates for every isoform-b position."""
     with base.connect() as con:
         cursor = con.cursor()
         # Collect all isoform-b indices, in order
         indices = []
         for row in cursor.execute('select * from scn5a_isoform_b order by idx'):
             indices.append(row['idx'])
         # Map each index to its diagram coordinates
         locations = {}
         for row in cursor.execute('select * from scn5a_diagram order by idx'):
             locations[row['idx']] = (row['x'], row['y'])
         # Write to file
         filename = self.data_out('diagram_isoform_b.csv')
         print('Writing ' + filename)
         with open(filename, 'w') as f:
             writer = self.csv_writer(f)
             writer.writerow(['idx', 'x', 'y'])
             for position in indices:
                 x, y = locations[position]
                 writer.writerow([position, x, y])
示例#16
0
 def _run(self):
     """Write voltage shifts together with conservedness scores to csv."""
     with base.connect() as con:
         cursor = con.cursor()
         # Load known shifts
         print('Loading voltage shift data')
         query = ('select * from epdata_filtered'
                  ' where dva is not null and dvi is not null')
         mutations = []
         for row in cursor.execute(query):
             mutations.append([
                 int(row['idx']),
                 float(row['dva']),
                 float(row['dvi']),
             ])
         # Load conservedness scores per position
         doms = {}
         hses = {}
         for row in cursor.execute('select * from conservedness'):
             doms[row['idx']] = row['dom']
             hses[row['idx']] = row['hse']
         # Store for graphing
         filename = self.data_out('voltage-shift-conservedness.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             writer = self.csv_writer(f)
             writer.writerow([
                 'idx',
                 'dom',
                 'hse',
                 'dva',
                 'dvi',
                 'window',
             ])
             for idx, dva, dvi in mutations:
                 # "window" is the gap between inactivation and activation
                 window = dvi - dva
                 writer.writerow([idx, doms[idx], hses[idx], dva, dvi, window])
         print('Done')
示例#17
0
    def _run(self):
        """Print midpoints_wt counts per isoform, split by beta1 and cell."""
        with base.connect() as con:
            c = con.cursor()

            # Accumulated counts: [all, with-b1, HEK, Oocyte, CHO]
            totals = [0] * 5

            # (query suffix, printed label, totals slot)
            breakdown = [
                (' and beta1="yes"', '  with b1 : ', 1),
                (' and cell="HEK"', '      HEK : ', 2),
                (' and cell="Oocyte"', '   Oocyte : ', 3),
                (' and cell="CHO"', '      CHO : ', 4),
            ]

            def show(letter=None):
                # Base query: rows for this isoform (or unknown sequence)
                q = 'select count(pub) from midpoints_wt'
                if letter:
                    q += ' where sequence = "' + letter + '"'
                else:
                    q += ' where sequence is null'
                n = next(c.execute(q))[0]
                totals[0] += n
                if letter:
                    print('Isoform ' + letter + ': ' + str(n))
                else:
                    print('Unknown : ' + str(n))
                # Sub-counts per beta1 / expression system
                for suffix, label, slot in breakdown:
                    n = next(c.execute(q + suffix))[0]
                    totals[slot] += n
                    print(label + str(n))
                print('')

            for letter in ('a', 'b', 'astar', 'bstar', None):
                show(letter)

            print('Totals: ' + str(totals))
 def _run(self):
     """Write idx, dva and dvi of every epdata row to a csv file."""
     with base.connect() as con:
         cursor = con.cursor()
         print('Loading voltage shift data')
         mutations = []
         for row in cursor.execute('select * from epdata'):
             # dva / dvi may be null; convert to float only when present
             dva = row['dva']
             dvi = row['dvi']
             if dva is not None:
                 dva = float(dva)
             if dvi is not None:
                 dvi = float(dvi)
             mutations.append([row['idx'], dva, dvi])
         filename = self.data_out('voltage-shift-indices.csv')
         print('Writing ' + filename)
         with open(filename, 'w') as f:
             writer = self.csv_writer(f)
             writer.writerow([
                 'idx',
                 'dva',
                 'dvi',
             ])
             for mutation in mutations:
                 writer.writerow(mutation)
         print('Done')
示例#19
0
 def _run(self):
     """Write diagram locations of isoform-b positions covered by NavAb."""
     with base.connect() as con:
         cursor = con.cursor()
         # Get scn5a isoform b indices
         isoform_b = set()
         for row in cursor.execute('select * from scn5a_isoform_b order by idx'):
             isoform_b.add(row['idx'])
         # Map each index to its diagram location
         locations = {}
         for row in cursor.execute('select * from scn5a_diagram order by idx'):
             locations[row['idx']] = (row['x'], row['y'])
         # Get scn5a indices matched to a NavAb acid, write to file
         filename = self.data_out('diagram-navab-cover.csv')
         with open(filename, 'w') as f:
             writer = self.csv_writer(f)
             writer.writerow(('idx', 'x', 'y'))
             query = 'select scn5a from navab_to_scn5a order by scn5a;'
             for row in cursor.execute(query):
                 position = row['scn5a']
                 if position in isoform_b:
                     x, y = locations[position]
                     writer.writerow((position, x, y))
    def acids(self):
        """
        Returns the amino acids (as a string of one-letter codes, which
        acts as a sequence) and a reverse lookup dict.

        (So 'KDER...' and a dict {K:0, D:1, E:2, ...}).

        The result is computed once and cached on the instance.
        """
        if self._acids is None:
            # Fixed ordering of the 20 one-letter amino acid codes.
            # (A dead `if False:` branch that loaded these from the
            # database was removed.)
            self._acids = 'KDERHQNSPTGACWFLIMYV'
            assert (len(self._acids) == 20)
            self._acid_map = dict(zip(self._acids,
                                      range(len(self._acids))))
        return self._acids, self._acid_map
    def _run(self):
        """
        Count electrophysiological changes per domain, region type and
        region, and write absolute plus relative counts to csv files.

        The three near-identical copy-pasted sections were folded into
        two helpers, and the SQL name filter now uses a bound parameter
        instead of string concatenation.
        """
        with base.connect() as con:
            c = con.cursor()

            # Change selecting queries
            condition_names = [
                'zero', 'act', 'inact', 'late', 'changed', 'unchanged',
            ]
            conditions = [
                'zero > 0',
                'act > 0',
                'inact > 0',
                'late > 0',
                # Changed / no change reported:
                '(zero > 0 or act > 0 or inact > 0 or late > 0)',
                '(zero < 1 and act < 1 and inact < 1 and late < 1)',
            ]

            def count_changes(column, names):
                # For each name, count mutations matching every condition.
                # Note that epdata_annotated contains each unique mutation
                # only once (although its fields such as `act` show a sum
                # of votes for whether or not it was affected, based on
                # all available reports).
                all_counts = []
                for name in names:
                    q = ('select count(idx) from epdata_annotated'
                         ' where ' + column + ' = ?')
                    counts = []
                    for condition in conditions:
                        qc = q + ' and ' + condition
                        counts.append(c.execute(qc, (name,)).fetchone()[0])
                    all_counts.append(counts)
                return all_counts

            def write_counts(fname, names, all_counts, relative):
                # Write one csv row per name; relative=True normalises
                # each row to fractions (empty rows divide by 1).
                filename = self.data_out(fname)
                print('Writing data to ' + filename)
                with open(filename, 'w') as f:
                    w = self.csv_writer(f)
                    w.writerow(['xaxis', 'xlabel'] + condition_names)
                    for k, name in enumerate(names):
                        if relative:
                            r = np.array(all_counts[k])
                            s = np.sum(r) or 1
                            w.writerow([k + 1, name] + list(r / s))
                        else:
                            w.writerow([k + 1, name] + all_counts[k])

            # Number of changes per domain
            domain_names = [
                r['name'] for r in c.execute('select name from domain')]
            domain_counts = count_changes('domain', domain_names)
            write_counts('changes-1-per-domain.csv',
                         domain_names, domain_counts, False)
            write_counts('changes-relative-1-per-domain.csv',
                         domain_names, domain_counts, True)

            # Number of changes per region type
            regtype_names = [
                r['name'] for r in c.execute('select name from regtype')]
            regtype_counts = count_changes('regtype', regtype_names)
            write_counts('changes-2-per-regtype.csv',
                         regtype_names, regtype_counts, False)
            write_counts('changes-relative-2-per-regtype.csv',
                         regtype_names, regtype_counts, True)

            # Number of changes per region
            q = 'select name from region order by start'
            region_names = [r['name'] for r in c.execute(q)]
            region_counts = count_changes('region', region_names)
            write_counts('changes-3-per-region.csv',
                         region_names, region_counts, False)
            write_counts('changes-relative-3-per-region.csv',
                         region_names, region_counts, True)
    def make(self, alpha):
        """
        Writes per-year epdata counts to csv files.

        Alpha=True, make file for alpha subunits (one column per sequence,
        plus 'Unknown' for rows without a sequence).
        Alpha=False, make file for beta subunits (columns 'yes'/'no' for
        beta1 co-expression).

        Three files are written: absolute counts per year (<basename>.csv),
        percentages per year (<basename>_frac.csv), and cumulative
        fractions per year (<basename>_cumfrac.csv).
        """
        with base.connect() as con:
            c = con.cursor()
            
            # Get years to find data for: from the earliest publication
            # year in epdata up to (and excluding) 2016.
            q = 'select min(year)'
            q += ' from epdata inner join publication'
            q += ' on epdata.pub = publication.key'
            year1 = next(c.execute(q))[0]
            years = range(year1, 2016)

            # Gather data: maps a column name to its per-year count list.
            data = {}            
            if alpha:
                # Alpha subunits
                names = []
                q = 'select distinct sequence from epdata'
                for row in c.execute(q):
                    names.append(row['sequence'])
                # Null sequences are handled separately below as 'Unknown'.
                names.remove(None)
                q = 'select year, count(old) as n'
                q += ' from epdata inner join publication'
                q += ' on epdata.pub = publication.key'
                q += ' where sequence = ? and year < 2016'
                q += ' group by year'
                for name in names:
                    ydata = [0]*len(years)
                    for row in c.execute(q, (name,)):
                        ydata[row['year'] - year1] = row['n']
                    data[name] = ydata
                # Same per-year count for rows without a known sequence.
                q = 'select year, count(old) as n'
                q += ' from epdata inner join publication'
                q += ' on epdata.pub = publication.key'
                q += ' where sequence is null and year < 2016'
                q += ' group by year'
                ydata = [0]*len(years)
                for row in c.execute(q):
                    ydata[row['year'] - year1] = row['n']
                data['Unknown'] = ydata
                names.append('Unknown')
                
                # Tweak order: fixed column order for plotting. Raises if
                # the database contains sequences not listed here.
                tweak = [
                    'achen',
                    'bstar',
                    'astar',
                    'b',
                    'a',
                    'Unknown',                    
                    ]
                if set(names) != set(tweak):
                    raise Exception('Tried to tweak order of lines, but custom'
                        ' order is lacking some values! Should have: '
                        + ','.join(names))
                names = tweak
            else:
                # Beta subunits: counts with / without beta1 co-expression.
                names = ['yes', 'no']
                q = 'select year, count(old) as n'
                q += ' from epdata inner join publication'
                q += ' on epdata.pub = publication.key'
                q += ' where beta1 = ? and year < 2016'
                q += ' group by year'
                for name in names:
                    ydata = [0]*len(years)
                    for row in c.execute(q, (name,)):
                        ydata[row['year'] - year1] = row['n']
                    data[name] = ydata
            
            # Filenames
            if alpha:
                basename = 'alpha_per_year'
            else:
                basename = 'beta1_per_year'
            
            # Write output: one row per year, one column per name. The
            # iterators are advanced in lockstep, one step per year.
            iters = []
            iters.append(iter(years))
            year_rows = []
            for name in names:
                iters.append(iter(data[name]))
            filename = self.data_out(basename + '.csv')
            print('Writing data to ' + filename)
            with open(filename, 'w') as f:
                w = self.csv_writer(f)
                w.writerow(['year'] + names)
                for k, year in enumerate(years):
                    row = [next(i) for i in iters]
                    w.writerow(row)
                    # Keep the counts (without the year column) for the
                    # fraction and cumulative-fraction files below.
                    year_rows.append(np.array(row[1:]))

            # Plot same data but as fraction (per-year percentages).
            # Note: year_rows is normalised in place and reused below.
            for k, row in enumerate(year_rows):
                if np.sum(row) > 0:
                    year_rows[k] = row / np.sum(row)
            filename = self.data_out(basename + '_frac.csv')
            print('Writing data to ' + filename)
            with open(filename, 'w') as f:
                w = self.csv_writer(f)
                w.writerow(['year'] + names)
                for k, year in enumerate(years):
                    w.writerow([year] + list(year_rows[k] * 100))

            # Plot same data but as cumulative fractions (stacked 0..1).
            for k, row in enumerate(year_rows):
                offset = 0
                for i, x in enumerate(row):
                    # In-place running sum over the fractions of year k.
                    year_rows[k][i] += offset
                    offset += x
            filename = self.data_out(basename + '_cumfrac.csv')
            print('Writing data to ' + filename)
            with open(filename, 'w') as f:
                w = self.csv_writer(f)
                w.writerow(['year'] + names)
                for k, year in enumerate(years):
                    w.writerow([year] + list(year_rows[k]))
import base
import scrapper
import msvcrt as m

# Bootstrap script: set up the database schema, then scrape the Zalando
# women's and men's clothing listings into it.
base.clear()

print("First step")
print("Create connection to database")
print("Make sure u have mysql database, user with have privileges to it and he using plugin mysql_native_password")
print("")
base.raw_print("Any key to continue ")
m.getch()

db = base.connect()

if not base.create_schema(db):
    print("Error occur during creating schema")
else:
    print("Operation end successfully")
    print("")
    print("Start webscraping")

    women_url = "https://www.zalando.pl/odziez-damska"
    men_url = "https://www.zalando.pl/odziez-meska/"

    scrapper.scrap(women_url, "kobiety", db)
    scrapper.scrap(men_url, "mezczyzni", db)

 def _run(self):
     """Export NavAb-mapped SCN5A position data as CSV files.

     Writes:
     1. Physical locations (x, y, z, r, t) of positions with epdata,
        with mutations only, and with neither (only positions that have
        a NavAb equivalent).
     2. Locations plus activation/inactivation midpoint shifts.
     3. Locations grouped into one file per SCN5A region.
     """
     with base.connect() as con:
         c = con.cursor()
         #
         # 1. Create files showing the physical locations of positions with
         #    epdata, no epdata and no mutations (only positions with a
         #    navab equivalent).
         #
         # Get all positions with mutations (no exac)
         q = 'select distinct idx from report where pub != "exac"'
         mutations = set()
         for row in c.execute(q):
             mutations.add(row['idx'])
         # Get all positions with epdata
         epdata = set()
         for row in c.execute('select distinct idx from epdata'):
             epdata.add(row['idx'])
         # Get NavAb-to-SCN5A position translation
         navab2scn5a = {}
         for row in c.execute('select * from navab_to_scn5a'):
             navab2scn5a[row['navab']] = row['scn5a']
         # Write locations of SCN5A mutations, where known
         file1 = self.data_out('navab-locations-epdata.csv')
         file2 = self.data_out('navab-locations-mutation.csv')
         file3 = self.data_out('navab-locations-free.csv')
         # Single with-statement: the previous try/finally leaked file1
         # (and raised NameError in the finally clause) if a later open()
         # failed.
         with open(file1, 'w') as f1, \
                 open(file2, 'w') as f2, \
                 open(file3, 'w') as f3:
             c1 = self.csv_writer(f1)
             c2 = self.csv_writer(f2)
             c3 = self.csv_writer(f3)
             header = ['idx', 'x', 'y', 'z', 'r', 't']
             c1.writerow(header)
             c2.writerow(header)
             c3.writerow(header)
             print('Writing data to ' + file1)
             print('            and ' + file2)
             print('            and ' + file3)
             for row in c.execute('select * from navab_locations'):
                 # Get scn5a equivalent
                 try:
                     idx = navab2scn5a[row['key']]
                 except KeyError:
                     # Skip if no scn5a equivalent is known
                     continue
                 # Store data
                 data = [
                     idx, row['x'], row['y'], row['z'], row['r'], row['t']
                 ]
                 if idx in epdata:
                     c1.writerow(data)
                 elif idx in mutations:
                     c2.writerow(data)
                 else:
                     c3.writerow(data)
         #
         # 2. Create a file with the positions of mutations and the
         #    associated dvi and dva
         #
         # Get midpoint shifts
         q = 'select * from epdata_filtered'
         q += ' where dva is not null and dvi is not null'
         names = {}
         dvi = {}
         dva = {}
         for row in c.execute(q):
             names[row['idx']] = row['old'] + str(row['idx']) + row['new']
             dvi[row['idx']] = row['dvi']
             dva[row['idx']] = row['dva']
         # Write file with distance to pore and midpoint shifts
         file1 = self.data_out('voltage-shift-navab-locations.csv')
         print('Writing data to ' + file1)
         with open(file1, 'w') as f1:
             c1 = self.csv_writer(f1)
             c1.writerow([
                 'name',
                 'r',
                 'dva',
                 'dvi',
                 'dva_abs',
                 'dvi_abs',
                 'sum_abs',
                 'window',
             ])
             for row in c.execute('select * from navab_locations'):
                 # Get scn5a equivalent or skip
                 try:
                     idx = navab2scn5a[row['key']]
                 except KeyError:
                     continue
                 # Get epdata or skip
                 try:
                     name = names[idx]
                     da = dva[idx]
                     di = dvi[idx]
                 except KeyError:
                     continue
                 # Bug fix: dva_abs/dvi_abs were swapped, and sum_abs was
                 # computed as abs(di) + abs(di) instead of
                 # abs(da) + abs(di).
                 c1.writerow([
                     name,
                     row['r'],
                     da,
                     di,
                     abs(da),
                     abs(di),
                     abs(da) + abs(di),
                     di - da,
                 ])
         #
         # 3. Create a file with the translated NavAb positions, grouped by
         #    SCN5A region
         #
         # Get regions, scn5a idx to region mapping
         regions = []
         scn5a_regions = {}
         for row in c.execute('select * from region order by start'):
             regions.append(row['name'])
             for idx in range(row['start'], 1 + row['end']):
                 scn5a_regions[idx] = row['name']
         # Create files
         path = self.data_out('navab_location_regions')
         if not os.path.isdir(path):
             os.makedirs(path)
         files = []
         csvs = {}
         try:
             # Open file per region
             for region in regions:
                 # Bug fix: the sanitised (space-free) name was computed
                 # but then discarded; use it to build the path.
                 filename = region.replace(' ', '-')
                 filename = os.path.join(path, filename + '.csv')
                 print('Writing data to ' + filename)
                 f = open(filename, 'w')
                 files.append(f)
                 csv = self.csv_writer(f)
                 csvs[region] = csv
                 csv.writerow(['idx', 'x', 'y', 'z'])
             # Get and write data
             q = 'select * from navab_locations order by key'
             for row in c.execute(q):
                 # Get scn5a equivalent or skip
                 try:
                     idx = navab2scn5a[row['key']]
                 except KeyError:
                     continue
                 # Write to correct file
                 csv = csvs[scn5a_regions[idx]]
                 csv.writerow([idx, row['x'], row['y'], row['z']])
         finally:
             for f in files:
                 f.close()
示例#25
0
    def _run(self):
        """Count EP changes per amino acid and write six CSV files.

        For each amino acid, counts the mutations whose summed outcomes
        satisfy each condition, once grouped by the original acid
        (``old``), once by the replacement acid (``new``), and once
        combined; each is written in absolute and row-normalised form.
        """
        with base.connect() as con:
            c = con.cursor()

            # Change selecting queries
            conditions = [
                'zero > 0',
                'act > 0',
                'inact > 0',
                'late > 0',
            ]

            # Acids
            acids = [
                str(r['key'])
                for r in c.execute('select key from acid order by rowid')
            ]

            header = ['xaxis', 'xlabel', 'zero', 'act', 'inact', 'late']

            def count_changes(column):
                # Per acid, count (idx, new) groups whose summed outcomes
                # satisfy each condition, matching acid on `column`
                # ('old' or 'new'). The acid value is bound with a `?`
                # placeholder instead of string interpolation.
                result = []
                for acid in acids:
                    q = 'select count(idx) from ('
                    q += ' select idx, sum(zero) as zero, sum(act) as act,'
                    q += ' sum(inact) as inact, sum(late) as late'
                    q += ' from epdata_annotated'
                    q += ' where ' + column + ' = ?'
                    counts = []
                    for condition in conditions:
                        qc = q + ' and ' + condition + ' group by idx, new)'
                        counts.append(c.execute(qc, (acid, )).fetchone()[0])
                    result.append(counts)
                return result

            def write_counts(name, rows):
                # Write one row per acid: index, acid label, four counts.
                filename = self.data_out(name)
                print('Writing data to ' + filename)
                with open(filename, 'w') as f:
                    w = self.csv_writer(f)
                    w.writerow(header)
                    for k, label in enumerate(acids):
                        w.writerow([k + 1, label] + list(rows[k]))

            def relative(counts):
                # Normalise each row to fractions; reshape(-1, 1) instead
                # of a hard-coded 20 so the acid count is not assumed.
                rel = np.array(counts, dtype=float)
                rel /= (np.sum(rel, axis=1).reshape(-1, 1) + 1e-12)
                return rel

            # Number of changes per acid_from
            counts_fr = count_changes('old')
            write_counts('acid-changes-1-from.csv', counts_fr)
            write_counts(
                'acid-changes-relative-1-from.csv', relative(counts_fr))

            # Number of changes per acid_to
            counts_to = count_changes('new')
            write_counts('acid-changes-2-to.csv', counts_to)
            write_counts(
                'acid-changes-relative-2-to.csv', relative(counts_to))

            # Combined data (from + to)
            combined = [
                [a + b for a, b in zip(fr, to)]
                for fr, to in zip(counts_fr, counts_to)
            ]
            write_counts('acid-changes-3-combined.csv', combined)
            write_counts(
                'acid-changes-relative-3-combined.csv', relative(combined))
示例#26
0
    def _run(self):
        """Print summary statistics about the mutation/EP database.

        Output goes to stdout only; no files are written.
        """
        with base.connect() as con:
            c = con.cursor()

            def count(q):
                # Run a scalar `select count(...)` query and return the
                # single counted value.
                return next(c.execute(q))[0]

            # Count positions
            print('Positions: ' + str(count(
                'select count(idx) from scn5a;')))
            # Count articles
            print('Articles: ' + str(count(
                'select count(key) from publication;')))
            # Count journals
            print('Journals: ' + str(count(
                'select count(key) from journal;')))

            # Count mutations
            # Note: `mutation` lists only unique (old, idx, new) pairs, not
            # reports of mutations!
            print('Mutations               : ' + str(count(
                'select count(old) from mutation;')))
            # Count unique mutation positions
            print('Positions with mutations: ' + str(count(
                'select count(idx) from '
                '(select distinct idx from mutation);')))

            # Count mutations (no exac)
            print('Mutations (no exac)               : ' + str(count(
                'select count(idx) from '
                '(select distinct idx, new from report'
                ' where pub != "exac");')))
            # Count unique mutation positions (no exac)
            print('Positions with mutations (no exac): ' + str(count(
                'select count(distinct idx) from report'
                ' where pub != "exac"')))

            # EP Data measurements
            print('EP measurements: ' + str(count(
                'select count(idx) from epdata;')))

            # Mutations with measured EP
            print('Mutations with epdata: ' + str(count(
                'select count(idx) from epdata_outcomes')))
            # Positions with measured EP
            print('Positions with epdata: ' + str(count(
                'select count(distinct idx) from epdata_outcomes')))

            # Mutations/positions with any change to EP
            # (A dead, never-executed `select distinct idx` query that was
            # built here has been removed.)
            any_change = ' where zero>0 or act>0 or inact>0 or late>0'
            print('Mutations with change to epdata: ' + str(count(
                'select count(idx) from epdata_outcomes' + any_change)))
            print('Positions with change to epdata: ' + str(count(
                'select count(distinct idx) from epdata_outcomes'
                + any_change)))

            # Mutations/positions without any change to EP
            no_change = ' where zero<1 and act<1 and inact<1 and late<1'
            print('Mutations with no change to epdata: ' + str(count(
                'select count(idx) from epdata_outcomes' + no_change)))
            print('Positions with no change to epdata: ' + str(count(
                'select count(distinct idx) from epdata_outcomes'
                + no_change)))

            # Mutations/positions per affected outcome (zero current,
            # activation, inactivation, late current).
            for field in ['zero', 'act', 'inact', 'late']:
                print('Mutations with ' + field + '=1: ' + str(count(
                    'select count(idx) from '
                    '(select distinct idx, new from epdata_outcomes'
                    ' where ' + field + '>0);')))
                print('Positions with ' + field + '=1: ' + str(count(
                    'select count(idx) from '
                    '(select distinct idx from epdata_outcomes'
                    ' where ' + field + '>0);')))
示例#27
0
def connect(database, username=None, password=None, environment=None):
    """Creates a database connection.

    Thin wrapper: opens the raw connection via ``base.connect`` and wraps
    it in a ``Connection`` object. All arguments are passed through to
    ``base.connect`` unchanged; see that function for their semantics
    (presumably: database name plus optional credentials and environment
    selector — confirm against ``base.connect``).
    """
    return Connection(base.connect(database, username, password, environment))
示例#28
0
 def _run(self):
     """Compare conservation scores for positions with/without mutations.

     Prints mean/std of the hse and dom scores overall and split by
     mutation status, then writes per-position scores to CSV files for
     use in box plots. Fix: the csv writer no longer shadows the
     database cursor `c` (renamed to `w`).
     """
     with base.connect() as con:
         c = con.cursor()
         # Get positions (may have gaps!)
         idx = []
         for r in c.execute('select idx from scn5a order by idx'):
             idx.append(r[0])
         idx = np.array(idx)
         # Get mutations for each position
         q = 'select distinct idx, new from report where pub != "exac"'
         mut = np.zeros(idx.shape)
         for r in c.execute(q):
             mut[r[0] - 1] = 1  # Positions start at 1
         # Get Human-squid-eel and domain alignment score
         hse = np.zeros(idx.shape, dtype=float)
         dom = np.zeros(idx.shape, dtype=float)
         q = 'select idx, hse, dom from conservedness order by idx'
         for k, r in enumerate(c.execute(q)):
             assert (r[0] - 1 == k)  # Score should be stored for each idx
             hse[k] = r[1]
             dom[k] = r[2]
         #
         # 1. Overal mean mutation count and hse/dom scores
         #    (Text output only)
         #
         hse_mean = np.mean(hse)
         hse_stdd = np.std(hse)
         dom_mean = np.mean(dom)
         dom_stdd = np.std(dom)
         print('Mean hse: ' + str(hse_mean) + ', std: ' + str(hse_stdd))
         print('Mean dom: ' + str(dom_mean) + ', std: ' + str(dom_stdd))
         #
         # 2. Position and dom/hse scores for positions with and without
         #    mutations.
         #    (Text output only)
         #
         idx_idx = idx[mut > 0]
         hse_idx = hse[mut > 0]
         dom_idx = dom[mut > 0]
         idx_neg = idx[mut == 0]
         hse_neg = hse[mut == 0]
         dom_neg = dom[mut == 0]
         hse_idx_mean = np.mean(hse_idx)
         hse_idx_stdd = np.std(hse_idx)
         dom_idx_mean = np.mean(dom_idx)
         dom_idx_stdd = np.std(dom_idx)
         hse_neg_mean = np.mean(hse_neg)
         hse_neg_stdd = np.std(hse_neg)
         dom_neg_mean = np.mean(dom_neg)
         dom_neg_stdd = np.std(dom_neg)
         print('HSE score:')
         print('  Mean, with mutations: ' + str(hse_idx_mean) + ', std: ' +
               str(hse_idx_stdd))
         print('  Mean, no mutations  : ' + str(hse_neg_mean) + ', std: ' +
               str(hse_neg_stdd))
         print('DOM score:')
         print('  Mean, with mutations: ' + str(dom_idx_mean) + ', std: ' +
               str(dom_idx_stdd))
         print('  Mean, no mutations  : ' + str(dom_neg_mean) + ', std: ' +
               str(dom_neg_stdd))
         #
         # 3. HSE and DOM score for positions with and without mutations
         #    (For use in a box-plot)
         #
         basename = 'score-with-mutations'
         filename = self.data_out(basename + '.txt')
         print('Writing info to ' + filename)
         with open(filename, 'w') as f:
             f.write('Scores for positions with mutations (idx, hse, dom)')
         filename = self.data_out(basename + '.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow(['position', 'hse-score', 'dom-score'])
             h = iter(hse_idx)
             d = iter(dom_idx)
             for p in idx_idx:
                 w.writerow([p, next(h), next(d)])
         basename = 'score-without-mutations'
         filename = self.data_out(basename + '.txt')
         print('Writing info to ' + filename)
         with open(filename, 'w') as f:
             f.write(
                 'Scores for positions without mutations (idx, hse, dom)')
         filename = self.data_out(basename + '.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow(['position', 'hse-score', 'dom-score'])
             h = iter(hse_neg)
             d = iter(dom_neg)
             for p in idx_neg:
                 w.writerow([p, next(h), next(d)])
         # Write labels used to create box plots
         basename = 'score-with-without-mutations-labels'
         filename = self.data_out(basename + '.csv')
         print('Writing label info to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow(['HSE-'])
             w.writerow(['HSE+'])
             w.writerow(['DOM-'])
             w.writerow(['DOM+'])
 def _run(self):
     """Compute mutation densities per region and per region type.

     Writes three CSV/TXT file pairs: global mutation density, density
     per region, and density per region type ("regtype").
     """
     print('Calculating...')
     # Get region names and positions
     position_count = 0  # Number of positions
     region_name = []  # Region name
     region_first = []  # First position in region
     region_final = []  # Final position in region
     region_size = []  # Number of positions in region
     region_map = {}  # Maps positions to regions
     region_regtype = []  # Region "regtype" values
     with base.connect() as con:
         c = con.cursor()
         q = 'select name, start, end, regtype from region order by start'
         for r in c.execute(q):
             k = len(region_name)
             region_name.append(r[0])
             region_first.append(r[1])
             region_final.append(r[2])
             region_size.append(1 + r[2] - r[1])
             for p in range(r[1], r[2] + 1):
                 region_map[p] = k
             region_regtype.append(r[3])
         # NOTE(review): relies on the loop variable `r` leaking out of the
         # loop (last region's `end`); raises NameError if the region table
         # is empty, and assumes regions cover positions up to the last end.
         position_count = r[2]
         # Number of regions
         region_count = len(region_name)
         # Number of mutations per region
         region_mutations = [0] * region_count
         mutation_count = 0
         q = 'select distinct idx, new from report where pub != "exac"'
         for r in c.execute(q):
             region_mutations[region_map[r[0]]] += 1
             mutation_count += 1
         # Get average density
         mutation_density = float(mutation_count) / position_count
         # Get mutation density and relative density, per region
         region_density = [0] * region_count
         region_reldens = [0] * region_count
         for k, count in enumerate(region_mutations):
             density = float(count) / region_size[k]
             # NOTE(review): computed as a *difference* from the global
             # density, while the .txt description written below calls it a
             # ratio ("(density in segment) / total density") — confirm
             # which is intended.
             reldens = density - mutation_density if density != 0 else 0
             region_density[k] = density
             region_reldens[k] = reldens
         # Region types
         regtype_name = []
         regtype_map = {}
         q = 'select name from regtype order by rowid'
         for k, r in enumerate(c.execute(q)):
             regtype_name.append(r[0])
             regtype_map[r[0]] = k
         # Count number of regtypes
         regtype_count = len(regtype_name)
         # Count mutations per regtype
         regtype_mutations = [0] * regtype_count
         # Count positions per regtype
         regtype_size = [0] * regtype_count
         for k, t in enumerate(region_regtype):
             t = regtype_map[t]
             regtype_mutations[t] += region_mutations[k]
             regtype_size[t] += region_size[k]
         regtype_density = [0] * regtype_count
         regtype_reldens = [0] * regtype_count
         for k, count in enumerate(regtype_size):
             density = float(regtype_mutations[k]) / count
             # NOTE(review): same difference-vs-ratio ambiguity as above.
             reldens = density - mutation_density if density != 0 else 0
             regtype_density[k] = density
             regtype_reldens[k] = reldens
         #
         # Write results
         #
         # 1. Global mutation density
         basename = 'mutation-density-global'
         filename = self.data_out(basename + '.txt')
         print('Writing info to ' + filename)
         with open(filename, 'w') as f:
             f.write('Number of positions in scn5a, mutations found, global'
                     ' mutation density.')
         filename = self.data_out(basename + '.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow(
                 ['total_positions', 'total_mutations', 'total_density'])
             w.writerow([position_count, mutation_count, mutation_density])
         # 2. Mutation density in regions
         basename = 'mutation-density-regions'
         filename = self.data_out(basename + '.txt')
         print('Writing info to ' + filename)
         with open(filename, 'w') as f:
             f.write(
                 'Region name, region start, region end, region size,'
                 ' mutation density in region and mutation density relative'
                 ' to global. Finally, density and relative density in'
                 ' percentages. Relative density is calculated as'
                 ' (density in segment) / total density. Except where'
                 ' (density in segment) == 0, there, relative density is'
                 ' set to 0')
         filename = self.data_out(basename + '.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow([
                 'name',
                 'idx',
                 'first',
                 'final',
                 'count',
                 'mutations',
                 'density',
                 'reldens',
                 'pdensity',
                 'preldens',
             ])
             for k, name in enumerate(region_name):
                 w.writerow([
                     name,
                     k,
                     region_first[k],
                     region_final[k],
                     region_size[k],
                     region_mutations[k],
                     region_density[k],
                     region_reldens[k],
                     region_density[k] * 100,
                     region_reldens[k] * 100,
                 ])
         # 3. Mutation density in region types
         basename = 'mutation-density-regtypes'
         filename = self.data_out(basename + '.txt')
         print('Writing info to ' + filename)
         with open(filename, 'w') as f:
             f.write('Regtype name, regtype size, mutation density and'
                     ' relative mutation density.')
         filename = self.data_out(basename + '.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow([
                 'name',
                 'idx',
                 'mutations',
                 'density',
                 'reldens',
                 'pdensity',
                 'preldens',
             ])
             for k, name in enumerate(regtype_name):
                 w.writerow([
                     name,
                     k,
                     regtype_mutations[k],
                     regtype_density[k],
                     regtype_reldens[k],
                     regtype_density[k] * 100,
                     regtype_reldens[k] * 100,
                 ])
示例#30
0
 def gather(self, q, fname1, fname2):
     """Run query `q`, write the raw midpoint data to `fname1`, then a
     regression-corrected version to `fname2`.
     """
     # Query db
     records = []
     with base.connect() as con:
         cursor = con.cursor()
         for row in cursor.execute(q):
             records.append((
                 row['pub'],
                 row['va'],
                 2 * row['stda'],
                 row['vi'],
                 2 * row['stdi'],
                 row['va'] - row['vi'],
             ))
     # Write raw data file
     fname1 = self.data_out(fname1)
     print('Writing to ' + fname1)
     with open(fname1, 'w') as f:
         out = self.csv_writer(f)
         out.writerow(['pub', 'va', '+-', 'vi', '+-', 'dv'])
         for rec in records:
             out.writerow(rec)
     print('Collected data from ' + str(len(records)) + ' reports.')
     #
     # Correct with linear regression
     #
     print('Subtracting linear regression...')
     # Gather data
     va = np.array([rec[1] for rec in records])
     vi = np.array([rec[3] for rec in records])
     # Fit line vi = a + b * va
     b, a = np.polyfit(va, vi, 1)
     if DEBUG:
         import matplotlib.pyplot as pl
         pl.figure()
         pl.plot(va, vi, 'o')
         x = np.linspace(np.min(va) - 10, np.max(va) + 10, 1000)
         pl.plot(x, a + b * x)
     # Subtract
     if DEBUG:
         pl.figure()
         pl.plot(va, vi - (a + b * va), 'o')
         pl.show()
     print('Coefficients: ' + str(a) + ', ' + str(b))
     # Write corrected data file
     fname2 = self.data_out(fname2)
     print('Writing to ' + fname2)
     with open(fname2, 'w') as f:
         out = self.csv_writer(f)
         out.writerow(['pub', 'va', '+-', 'vic', '+-'])
         for rec in records:
             corrected = list(rec[:-1])
             corrected[3] = corrected[3] - (a + b * corrected[1])
             out.writerow(corrected)
     # Get Pearson correlation coefficient
     print('Pearson correlation coefficient: ' +
           str(np.corrcoef(va, vi)[1, 0]))
     print('Done')
示例#31
0
    def _run(self):
        """Write the EP data as a LaTeX longtable (epdata-table2.tex).

        Fixes: '\\endfirsthead' was missing its trailing newline; the
        Delta headers used the invalid escape '\\D' implicitly; a None
        beta1 value crashed the ' & '.join.
        """
        # Collect tex references
        refs = {}
        with base.connect() as con:
            c = con.cursor()
            q = 'select key, tex from publication_tex'
            for k, row in enumerate(c.execute(q)):
                refs[row['key']] = row['tex']
        # Create table file
        filename = self.data_out('epdata-table2.tex')
        fields = [
            'pub',
            'old',
            'idx',
            'new',
            'dva',
            'dvi',
            'zero',
            'act',
            'inact',
            'late',
            'sequence',
            'cell',
            'beta1',
        ]

        # Sequence formatting: map internal codes to display labels
        def seq(s):
            if s == 'astar':
                return 'a*'
            elif s == 'bstar':
                return 'b*'
            elif s == 'achen':
                return 'a**'
            elif s is None:
                return ''
            return s

        # Cell-type formatting: abbreviate long names
        def cell(c):
            if c in ['HEK', 'CHO']:
                return c
            elif c == 'Mouse myocyte':
                return 'MM'
            elif c == 'Oocyte':
                return 'Ooc.'
            elif c is None:
                return ''
            return c

        # Tri-state flag: 1 -> yes, -1 -> no, anything else -> blank
        def yesno(x):
            return 'yes' if x == 1 else ('no' if x == -1 else '')

        # Create table
        with open(filename, 'w') as f:
            # Header
            size = 'tiny'
            f.write('\\begin{' + size + '}\n')
            f.write('\\startrowcolors\n')
            f.write('\\begin{longtable}{p{4cm}|l|llll|ll|lll}\n')
            f.write('\\caption{\\label{tab:epdata}EP Data} \\\\\n')
            f.write('\\hline\n')
            f.write('Publication')
            f.write(' & Mutation')
            f.write(' & Act. & Inact. & Late & Zero')
            f.write(' & ${\\Delta}V_a$ & ${\\Delta}V_i$')
            f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n')
            f.write('\\hline\n')
            # Fix: newline was missing after \endfirsthead
            f.write('\\endfirsthead\n')
            f.write('\\hline\n')
            f.write('\\rowcolor{white}\n')
            f.write('Publication')
            f.write(' & Mutation')
            f.write(' & Act. & Inact. & Late & Zero')
            f.write(' & ${\\Delta}V_a$ & ${\\Delta}V_i$')
            f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n')
            f.write('\\hline\n')
            f.write('\\endhead\n')
            f.write('\\hline\n')
            f.write('\\endfoot\n')
            # Body
            form = '{:.3g}'
            with base.connect() as con:
                c = con.cursor()
                q = 'select ' + ', '.join(fields) + ' from epdata'
                q += ' order by idx, new'
                for k, row in enumerate(c.execute(q)):
                    x = []
                    x.append(row['pub'] + '\\cite{' + refs[row['pub']] + '}')
                    x.append(row['old'] + str(row['idx']) + row['new'])
                    x.append(yesno(row['act']))
                    x.append(yesno(row['inact']))
                    x.append(yesno(row['late']))
                    x.append(yesno(row['zero']))
                    x.append(
                        '' if row['dva'] is None else form.format(row['dva']))
                    x.append(
                        '' if row['dvi'] is None else form.format(row['dvi']))
                    x.append(cell(row['cell']))
                    x.append(seq(row['sequence']))
                    # Robustness: a None (or numeric) beta1 would break the
                    # string join below
                    x.append(
                        '' if row['beta1'] is None else str(row['beta1']))
                    f.write(' & '.join(x) + ' \\\\\n')
            # Footer
            f.write('\\end{longtable}\n')
            f.write('\\end{' + size + '}\n')
# --- Example #32 (original scrape marker: 示例#32 / 0) ---
    def _run(self):
        """
        Checks mutations that have multiple epdata reports for
        inconsistent outcome annotations and prints a summary to stdout.

        Consistent mutations are printed first, then a count summary,
        then the mutations with conflicting reports.
        """
        with base.connect() as con:
            c = con.cursor()

            # Mutations with multiple epdata reports
            names = []
            parts = []
            counts = []
            q = 'select old, idx, new, n from ('
            q += 'select *, count(idx) as n from epdata group by idx, new'
            q += ') where n > 1 order by n desc, idx, new'
            for row in c.execute(q):
                names.append(row['old'] + str(row['idx']) + row['new'])
                parts.append((row['old'], row['idx'], row['new']))
                counts.append(row['n'])

            # Get data for mutations with multiple reports
            fields = [
                'act', 'inact', 'late', 'zero', 'sequence', 'cell', 'beta1',
                'pub'
            ]
            data = {}
            q = 'select * from epdata'
            q += ' where old=? and idx=? and new=?'
            for k, name in enumerate(names):
                d = {field: [] for field in fields}
                for row in c.execute(q, parts[k]):
                    for field in fields:
                        d[field].append(row[field])
                data[name] = d

            # Get list of mutations with inconsistencies.
            # Ternary fields use -1/1 for no/yes; binary fields use 0/1.
            ters = ['act', 'inact', 'late']
            bins = ['zero']
            issues = []
            for name in names:
                d = data[name]
                for t in ters:
                    if -1 in d[t] and 1 in d[t]:
                        issues.append(name)
                        break
                else:  # If didn't break
                    for b in bins:
                        # Bug fix: was `d[t]`, which re-tested the last
                        # ternary field instead of the binary field `b`.
                        if 0 in d[b] and 1 in d[b]:
                            issues.append(name)
                            break

            # Show output
            def pront(name):
                # Print a mutation name and one line per report.
                print(name)
                d = data[name]
                for k in range(len(d['act'])):
                    for f in fields:
                        print(d[f][k], end=' ')
                    print('')

            for name in names:
                if name not in issues:
                    pront(name)
            print()
            print('=' * 40)
            print('Mutations with doubles: ' + str(len(counts)))
            print('Mutations with issues: ' + str(len(issues)))
            print('=' * 40)
            print()
            for name in issues:
                pront(name)
    def _run(self):
        """
        Calculates mutation densities (mutations per residue), globally
        and then per domain, per region type, and per region, and writes
        the results to three CSV files.

        Each CSV has one row per group with its absolute density and its
        density relative to the global density.
        """
        with base.connect() as con:
            c = con.cursor()

            def one(q, args=()):
                # Run a single-value query (parameterized) and return the
                # first column of the first row.
                return c.execute(q, args).fetchone()[0]

            def write_csv(filename, names, density, reldens):
                # Write one density CSV: one row per name.
                print('Writing data to ' + filename)
                with open(filename, 'w') as f:
                    w = self.csv_writer(f)
                    w.writerow([
                        'xaxis',
                        'xlabel',
                        'density',
                        'relative-density',
                    ])
                    for k, name in enumerate(names):
                        w.writerow([k + 1, name, density[k], reldens[k]])

            #
            # Global mutation density
            #

            # Total number of positions, and positions with a (non exac)
            # mutation
            total_length = one('select count(idx) from scn5a')
            total_count = one('select count(idx) from mutation_no_exac')
            global_density = total_count / total_length

            #
            # Mutation density per domain
            #
            domain_names = [
                r['name'] for r in c.execute('select name from domain')]
            # Parameterized queries (`?`) instead of string concatenation
            domain_lengths = [
                one('select sum(length) from region where domain = ?',
                    (name, )) for name in domain_names]
            domain_count = [
                one('select count(idx) from mutation_no_exac_annotated'
                    ' where domain = ?', (name, )) for name in domain_names]
            domain_density = np.array(domain_count) / np.array(domain_lengths)
            domain_reldens = domain_density - global_density
            write_csv(
                self.data_out('density-1-per-domain.csv'),
                domain_names, domain_density, domain_reldens)

            #
            # Mutation density per region type
            #
            regtype_names = [
                r['name'] for r in c.execute('select name from regtype')]
            regtype_lengths = [
                one('select sum(length) from region where regtype = ?',
                    (name, )) for name in regtype_names]
            regtype_count = [
                one('select count(idx) from mutation_no_exac_annotated'
                    ' where regtype = ?', (name, )) for name in regtype_names]
            regtype_density = (
                np.array(regtype_count) / np.array(regtype_lengths))
            regtype_reldens = regtype_density - global_density
            write_csv(
                self.data_out('density-2-per-regtype.csv'),
                regtype_names, regtype_density, regtype_reldens)

            #
            # Mutation density per region
            #
            q = 'select name from region order by start'
            region_names = [r['name'] for r in c.execute(q)]
            region_lengths = [
                one('select length from region where name = ?', (name, ))
                for name in region_names]
            region_count = [
                one('select count(idx) from mutation_no_exac_annotated'
                    ' where region = ?', (name, )) for name in region_names]
            region_density = np.array(region_count) / np.array(region_lengths)
            region_reldens = region_density - global_density
            write_csv(
                self.data_out('density-3-per-region.csv'),
                region_names, region_density, region_reldens)
    def _run(self):
        """
        Gathers per-mutation changes ("deltas") in amino acid properties,
        grouped by EP outcome, writes them to CSV files for plotting, and
        runs statistical tests (ANOVA and Kruskal-Wallis) comparing the
        outcome groups.
        """
        with base.connect() as con:
            c = con.cursor()

            # Get acid properties
            properties = [
                'average_residue_mass',
                'percent_buried_residues',
                'v_waals',
                'polarity_ranking',
                'charge',
                'hydrophobicity',
                'helix_propensity',
            ]
            pvalues = {}
            for row in c.execute('select * from acid'):
                pvalues[row['key']] = {p: row[p] for p in properties}

            # Gather delta-properties, per outcome
            # Note that epdata_outcomes contains each mutation only once
            outcomes = ['zero', 'act', 'inact', 'late']
            ovalues = {o: {p: [] for p in properties} for o in outcomes}
            for row in c.execute('select * from epdata_outcomes'):
                old, new = row['old'], row['new']
                for o in outcomes:
                    if row[o] > 0:
                        for p in properties:
                            delta = pvalues[new][p] - pvalues[old][p]
                            ovalues[o][p].append(delta)

            # Add unchanged outcome (no outcome reported positive)
            outcomes += ['unchanged']
            ovalues['unchanged'] = {p: [] for p in properties}
            q = 'select * from epdata where'
            q += ' (act < 1 and inact < 1 and zero < 1 and late < 1)'
            for row in c.execute(q):
                old, new = row['old'], row['new']
                for p in properties:
                    delta = pvalues[new][p] - pvalues[old][p]
                    ovalues['unchanged'][p].append(delta)

            # Add changed outcome (any outcome reported positive)
            outcomes += ['changed']
            ovalues['changed'] = {p: [] for p in properties}
            q = 'select * from epdata where'
            q += ' (act > 0 or inact > 0 or zero > 0 or late > 0)'
            for row in c.execute(q):
                old, new = row['old'], row['new']
                for p in properties:
                    delta = pvalues[new][p] - pvalues[old][p]
                    ovalues['changed'][p].append(delta)

            # Store: one CSV per outcome, points spread over an x-interval
            # of half-width `half` around the outcome's index.
            basename = 'deltas-'
            half = 0.3
            for k, o in enumerate(outcomes):
                filename = self.data_out(basename + str(1 + k) + '-' + o +
                                         '.csv')
                print('Writing ' + filename)
                with open(filename, 'w') as f:
                    # Use `w` for the writer: the original rebound `c`,
                    # shadowing the database cursor.
                    w = self.csv_writer(f)
                    w.writerow(['x'] + properties)
                    n = len(ovalues[o][properties[0]])
                    iters = [iter(ovalues[o][p]) for p in properties]
                    for x in np.linspace(1 + k - half, 1 + k + half, n):
                        w.writerow([x] + [next(i) for i in iters])
            filename = self.data_out('deltas-labels.csv')
            print('Writing ' + filename)
            with open(filename, 'w') as f:
                f.write('x, label\n')
                for k, o in enumerate(outcomes):
                    f.write(str(k + 1) + ',' + o + '\n')

            #
            # Now run a test, for each property (so independent tests!).
            #
            # Data isn't normal, so we use a Kruskal Wallis test.
            #
            # In each test, test if we can reject the null hypothesis that the
            # results in each outcome group are from the same distribution.
            #
            # This gives us a pvalue. If the pvalue is low, we can say the
            # groups are different.
            #
            print('-' * 40)
            print('Comparing 5 outcomes (not including `changed`)')
            print('-' * 40)
            for p in properties:
                print(p)
                # Gather samples for each outcome, not including `changed`
                groups = [np.array(ovalues[o][p]) for o in outcomes[:-1]]
                # Perform test
                statistic, pvalue = sp.stats.f_oneway(*groups)
                print('Anova: ' + str(statistic) + ', ' + str(pvalue))
                statistic, pvalue = sp.stats.kruskal(*groups)
                print('Kruskal-Wallis: ' + str(statistic) + ', ' + str(pvalue))
            #
            # But... samples can be in multiple changed groups, so maybe
            # compare them individually with `unchanged` as well
            #
            print('-' * 40)
            print('Comparing 5 outcomes with `unchanged`')
            print('-' * 40)
            # Derive the index instead of hardcoding `groups[4]`, so the
            # comparison stays correct if the outcome list changes.
            iu = outcomes.index('unchanged')
            for p in properties:
                print(p)
                # Gather samples for each outcome
                groups = [np.array(ovalues[o][p]) for o in outcomes]
                # Perform tests (note: this includes comparing `unchanged`
                # with itself, as in the original output)
                for i, o in enumerate(outcomes):
                    print('Unchanged vs ' + o)
                    statistic, pvalue = sp.stats.kruskal(groups[iu], groups[i])
                    print('Kruskal-Wallis: ' + str(statistic) + ', ' +
                          str(pvalue))
                print('- ' * 20)
# --- Example #35 (original scrape marker: 示例#35 / 0) ---
 def _run(self):
     """
     Relates Gonnet substitution scores to activation (dva) and
     inactivation (dvi) midpoint shifts, and writes two CSV files: the
     raw (score, dva, dvi) pairs and per-score-bin summary statistics.
     """
     with base.connect() as con:
         c = con.cursor()
         # Load gonnet scores
         print('Loading gonnet scores')
         scores = {}
         for row in c.execute('select * from gonnet_score'):
             # Store both key orders, so lookups below work regardless of
             # whether the pair is stored as (old, new) or (new, old).
             scores[row['key1'] + row['key2']] = row['score']
             scores[row['key2'] + row['key1']] = row['score']
         # Load voltage shifts
         print('Loading voltage shift data')
         q = 'select * from epdata_filtered'
         q += ' where dva is not null and dvi is not null'
         mutations = []
         for row in c.execute(q):
             # Each entry: [gonnet score, activation shift, inact. shift]
             mutations.append([
                 int(scores[row['old'] + row['new']]),
                 float(row['dva']),
                 float(row['dvi']),
             ])
         # Create file relating the two
         filename = self.data_out('voltage-shift-gonnet-scores.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow([
                 'score',
                 'dva',
                 'dvi',
             ])
             for mutation in mutations:
                 w.writerow(mutation)
         #
         # Divide the score axis into bins, for each bin, calculate
         #   (1/bin_size) * sum( dva )
         # and
         #   (1/bin_size) * sum( abs(dva) )
         # then repeat with dvi
         #
         print('Calculating binned sum of squares')
         # Create bins
         # Bin centers at -5.5, -4.5, ..., 5.5, each of width 1. This
         # assumes all scores fall in [-6, 6); an out-of-range score
         # would make the np.where() lookup below raise IndexError.
         lo = -5.5
         hi = 5.5
         bw = 1
         centers = np.arange(lo, hi + bw, bw)
         lower = centers - bw * 0.5
         upper = centers + bw * 0.5
         # Gather shifts in bins
         dva = [[] for x in centers]
         dvi = [[] for x in centers]
         for mutation in mutations:
             score, da, di = mutation
             # Index of the (single) bin whose [lower, upper) range
             # contains this score
             i = np.where((score >= lower) * (score < upper))[0][0]
             dva[i].append(da)
             dvi[i].append(di)
         # Calculate stats and write to file
         # NOTE(review): empty bins are replaced with [0], so their mean
         # and std are reported as 0 rather than omitted -- presumably
         # intentional for plotting; confirm.
         dva = [np.array(x) if x else np.array([0]) for x in dva]
         dvi = [np.array(x) if x else np.array([0]) for x in dvi]
         print([np.mean(x) for x in dva])
         print([np.mean(x) for x in dvi])
         filename = self.data_out('voltage-shift-gonnet-scores-binned.csv')
         print('Writing data to ' + filename)
         with open(filename, 'w') as f:
             w = self.csv_writer(f)
             w.writerow([
                 'center',
                 'dva-mean',
                 'dvi-mean',
                 'dva-std',
                 'dvi-std',
                 'dva-abs-mean',
                 'dvi-abs-mean',
             ])
             data = [
                 centers,
                 [np.mean(x) for x in dva],
                 [np.mean(x) for x in dvi],
                 [np.std(x) for x in dva],
                 [np.std(x) for x in dvi],
                 [np.mean(np.abs(x)) for x in dva],
                 [np.mean(np.abs(x)) for x in dvi],
             ]
             # Transpose the column-wise lists so each written row holds
             # the statistics for one bin.
             data = np.array([np.array(x) for x in data]).transpose()
             for row in data:
                 w.writerow(row)
         print('Done')
# --- Example #36 (original scrape marker: 示例#36 / 0) ---
    def _run(self):
        """
        Creates a LaTeX longtable of wild-type activation/inactivation
        midpoints per publication and writes it to `midpoint-table.tex`.
        """
        # Collect tex references (publication key -> citation key)
        refs = {}
        with base.connect() as con:
            c = con.cursor()
            q = 'select key, tex from publication_tex'
            for row in c.execute(q):
                refs[row['key']] = row['tex']
        # Create table file
        filename = self.data_out('midpoint-table.tex')
        fields = [
            'pub',
            'va',
            'na',
            'stda',
            'vi',
            'ni',
            'stdi',
            'sequence',
            'cell',
            'beta1',
        ]

        # Sequence formatting
        def seq(s):
            # Map internal sequence codes to display labels; unknown
            # values pass through, None becomes '?'.
            if s == 'astar':
                return 'a*'
            elif s == 'bstar':
                return 'b*'
            elif s is None:
                return '?'
            return s

        # Create table
        with open(filename, 'w') as f:
            # Header
            # Note: backslashes are escaped explicitly (`\\sigma`); the
            # original `\sigma` relied on an invalid `\s` escape sequence,
            # which raises a SyntaxWarning on modern Python.
            size = 'footnotesize'
            f.write('\\begin{' + size + '}\n')
            f.write('\\startrowcolors\n')
            f.write('\\begin{longtable}{p{5cm}|lll|lll|lll}\n')
            f.write('\\caption{\\label{midpoints}Midpoints} \\\\\n')
            f.write('\\hline\n')
            f.write('Publication')
            f.write(' & $V_a$ & $\\sigma_a$  & $n_a$')
            f.write(' & $V_i$ & $\\sigma_i$  & $n_i$')
            f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n')
            f.write('\\hline\n')
            f.write('\\endfirsthead')
            f.write('\\hline\n')
            f.write('\\rowcolor{white}\n')
            f.write('Publication')
            f.write(' & $V_a$ & $\\sigma_a$  & $n_a$')
            f.write(' & $V_i$ & $\\sigma_i$  & $n_i$')
            f.write(' & Cell & $\\alpha$ & $\\beta1$ \\\\\n')
            f.write('\\hline\n')
            f.write('\\endhead\n')
            f.write('\\hline\n')
            f.write('\\endfoot\n')
            # Body
            form = '{:.3g}'
            with base.connect() as con:
                c = con.cursor()
                q = 'select ' + ', '.join(fields) + ' from midpoints_wt'
                for row in c.execute(q):
                    x = []
                    x.append('\\citet{' + refs[row['pub']] + '}')
                    if row['na'] != 0:
                        x.append(form.format(row['va']))
                        x.append(form.format(row['stda']))
                        x.append(form.format(row['na']))
                    else:
                        # '&&' joined with ' & ' below yields three empty
                        # table cells.
                        x.append('&&')
                    if row['ni'] != 0:
                        x.append(form.format(row['vi']))
                        x.append(form.format(row['stdi']))
                        x.append(form.format(row['ni']))
                    else:
                        x.append('&&')
                    x.append(row['cell'].replace('Oocyte', 'Ooc.'))
                    x.append(seq(row['sequence']))
                    x.append(row['beta1'])
                    f.write(' & '.join(x) + ' \\\\\n')
            # Footer
            f.write('\\end{longtable}\n')
            f.write('\\end{' + size + '}\n')