Exemplo n.º 1
0
def convert_timeseries_to_intervalseries(timeseries, yaxis_only=False):
    '''
    Converts an ordered timeseries into the series of gaps between
    consecutive dates.

    Negative intervals are not accepted (they mean the timeseries is out of
    order); each one is reported through gerr.generic_error_handler and is
    left out of the result.
    input:
        timeseries: [[numeric_date_from_start, arbitrary value of interest], ...]
        yaxis_only: False by default. If True the return is [20, 6, ...]
    output:
        intervalseries: [[0, 20], [1, 6], ...] where the first entry of each
                        pair is the index of the earlier of the two points
    '''
    gaps = []
    #Walk over each point together with the point that follows it
    neighbours = zip(timeseries, timeseries[1:])
    for position, (earlier, later) in enumerate(neighbours):
        gap = later[0] - earlier[0]
        if gap < 0:
            #Out of order dates are reported, never stored
            m = 'Negative interval detected, this should not be an out of order timeseries'
            gerr.generic_error_handler(message = m)
            continue
        gaps.append(gap if yaxis_only else [position, gap])
    return gaps
Exemplo n.º 2
0
def from_list(settings,
              data,
              naming_scheme,
              exclusions=None,
              chunk_size=100000,
              inclusion=None):
    '''
    Turns a list of rows into dictionaries and inserts them into the mongo
    collection described by settings, chunk_size documents at a time.

    Cells whose lowercased text equals 'null' are left out of the resulting
    document. The caller's naming_scheme is never modified; exclusions are
    applied to a private copy.
    input:
        settings - mongo connection settings (mongoConnect standard)
        data - list of lists. must correspond to headers in naming scheme
        naming_scheme - dictionary mapping inset list index to column/key name to use
        (optional)
        exclusions - headers to ignore in naming_scheme. Must be value in dictionary.
        chunk_size - maximum number of documents per insert call (positive integer)
        inclusion - dictionary of key/values copied into every document
    output:
        None

    dependencies: gale
    '''
    import gale.databases.mongoConnect as mcxn
    import gale.general.errors as err

    #Private copies: neither the caller's dictionaries nor a shared default
    #object can be altered by this call
    naming_scheme = dict(naming_scheme)
    base_document = dict(inclusion) if inclusion else {}

    def _transform_row(theader, trow):
        tdict = dict(base_document)
        for i, tkey in theader.items():
            if trow[i].lower() != 'null':
                tdict[tkey] = trow[i]
        return tdict

    #Handle the exclusion first
    for tval in exclusions or ():
        itemset = [i for i, j in naming_scheme.items() if j == tval]
        if not itemset:
            #Report clearly instead of failing with an IndexError below
            m = 'populateMongo.from_list\n'
            m += 'Exclusion value not found in naming scheme'
            err.generic_error_handler(message=m)
            continue
        if len(itemset) > 1:
            m = 'populateMongo.from_list, line 28\n'
            m += 'More than one value in naming scheme matches the exclusion value'
            err.generic_error_handler(message=m)
        del naming_scheme[itemset[0]]

    #Check to make sure all values are distinct
    if len(set(naming_scheme.values())) != len(naming_scheme):
        m = 'populateMongo.from_list, line 28\n'
        m += 'Values in naming scheme not unique'
        err.generic_error_handler(message=m)

    data = [_transform_row(naming_scheme, datarow) for datarow in data]

    #Iterate over subsets of the list. A list shorter than chunk_size is
    #covered by a single pass; an empty list results in no insert call.
    tdb = mcxn.MongoConnection(settings)
    try:
        for start in range(0, len(data), chunk_size):
            tdb.collection.insert(data[start:start + chunk_size])
    finally:
        #Always release the connection, even when an insert fails
        tdb.tearDown()
Exemplo n.º 3
0
def gini(data):
    '''
    Calculates the gini coefficient for a given dataset.

    The input is never modified: the values are sorted into a new list, so
    any sequence (list, tuple, ...) is accepted.
    input:
        data- sequence of values, either raw counts or frequencies.
              A sum between 0.99 and 1.0 is taken to mean frequencies.
              A sum above 1.0 is taken to mean raw counts, which are
              transformed to frequencies.
              A sum of 0 or a sum below 0.99 is reported through
              gerr.generic_error_handler.
    output:
        gini- float, from 0.0 to 1.0 (1.0 most likely never realized since it is
              only achieved in the limit)
    '''
    def _unit_area(height, value, width):
        '''
        Calculates a single bars area.
        Area is composed of two parts:
            The height of the bar up until that point
            The addition from the current value (calculated as a triangle)
        input:
            height: previous bar height or sum of values up to current value
            value: current value
            width: width of individual bar
        output:
            bar_area: area of current bar
        '''
        return (height * width) + ((value * width) / 2.)

    #Fair area will always be 0.5 when frequencies are used
    fair_area = 0.5
    #Check that input data has non-zero values, if not report an error
    datasum = float(sum(data))
    if datasum == 0:
        m = 'Data sum is 0.0.\nCannot calculate Gini coefficient for non-responsive population.'
        gerr.generic_error_handler(message=m)
    elif datasum < 0.99:
        m = 'Data sum is frequencies and less than 1.0.'
        gerr.generic_error_handler(message=m)
    #If data does not sum to 1.0 transform to frequencies
    elif datasum > 1.0:
        data = [x / datasum for x in data]
    #Sort into a new list so the caller's data keeps its order
    ordered = sorted(data)
    #Calculate the area under the curve for the current dataset
    width = 1 / float(len(ordered))
    height, area = 0.0, 0.0
    for value in ordered:
        area += _unit_area(height, value, width)
        height += value
    #Calculate the gini
    return (fair_area - area) / fair_area
Exemplo n.º 4
0
def fold_change(obs, exp):
    '''
    Rescales the observation (either a single value or list) by the expected value
    Cannot accept zero as the expected value: it is reported through
    gerr.generic_error_handler and, should that handler return, a ValueError
    is raised.
    input:
        obs -- int/float or list/tuple of int/floats
        exp -- int/float
    output:
        norm -- float, or list of floats when obs is a list/tuple
    '''
    if exp == 0:
        m = "Cannot accept zero as an expected value"
        gerr.generic_error_handler(message=m)
        #There is no result to return for a zero expectation
        raise ValueError(m)
    #Convert once so integer inputs always use true division
    scale = float(exp)
    if isinstance(obs, (list, tuple)):
        return [ival / scale for ival in obs]
    return obs / scale
Exemplo n.º 5
0
def create_all_combinations(n):
    '''
    Builds every combination of the values in n, for every size from 1 up
    to len(n). Lists longer than 10 values are reported as an error, since
    the number of combinations grows too quickly.
    input:
        n - list of values
    output:
        combs - list of tuples, ordered by increasing combination size
    '''
    from itertools import combinations
    #exception
    too_long = len(n) > 10
    if too_long:
        import gale.general.errors as gerr
        m = 'List length too long for this function'
        gerr.generic_error_handler(message=m)
    #Collect the combinations one size at a time
    combs = []
    for size in range(1, len(n) + 1):
        combs.extend(combinations(n, size))
    return combs
Exemplo n.º 6
0
def parse_infomap(comfile, netfile='', hierarchy=True):
    '''
    Parses an infomap community file.
    Returns the mod2node and node2mod dictionaries.

    Every data line is split on whitespace: the first token is the
    colon-separated module path (its last element is dropped) and the last
    token is used as the node name, exactly as written in the file.
    Lines starting with '#' and blank/whitespace-only lines are skipped.
    ****Network features still missing*****
    inputs:
        comfile- name of infomap community file
        netfile- network file (currently unused)
        hierarchy- boolean. If True node2mod holds the full module path
                   joined with '-', otherwise only the top level module.
                   Forced to False when NetworkX cannot be imported.
    outputs:
        mod2node - dictionary of module path ('-' joined) to Module instance,
                   one entry for every level of every path
        node2mod - dictionary of node name to module name
    '''
    import sys
    try:
        import networkx as nx
    except ImportError:
        hierarchy = False
        #sys.stderr.write behaves the same on Python 2 and Python 3
        sys.stderr.write("NetworkX is not available\n")
        sys.stderr.write("Any network features of the modules will not be calculated\n")

    class Module(object):
        '''
        The module class
        Contains information related to sub-modules, such as sub-hierarchy,
        connecting modules, connecting partners, size, and nodes
        '''
        def __init__(self):
            ##Level - 0, 1, 2, 3
            self.level = None
            ##Hierarchy
            self.children = False
            self.submodules = []
            ##Size related
            self.nodes = []
            self.size = None
            ##Network characteristics
            #module:link strength
            self.connect_modules = {}
            #node: outside_node
            self.connect_nodes = {}

        def attribute_generators(self):
            '''
            Fills in the attributes derived from the collected nodes
            '''
            self.size = len(self.nodes)

    def _reader(fname):
        '''
        Read the input file, ignore comments that are # and blank lines
        Returns a list of [module listing, node name] pairs
        '''
        data = []
        #The with block closes the file even if parsing fails
        with open(fname) as handle:
            for line in handle:
                if line.startswith('#') or not line.strip():
                    continue
                sline = line.split()
                #First part is modules, then numeric, then node name.
                #The last element of the path is dropped.
                modlisting = sline[0].split(':')[:-1]
                nodename = sline[-1]
                data.append([modlisting, nodename])
        return data

    #Reference variables
    mod2node = {}
    node2mod = {}
    #Read in the community data
    for mods, node in _reader(comfile):
        if node in node2mod:
            #Only needed on this error path, so imported here
            import gale.general.errors as gerr
            m='Duplicitous node identifier'
            gerr.generic_error_handler(message=m)
        #Check on hierarchy
        if hierarchy:
            modname = '-'.join(mods)
        else:
            modname = mods[0]
        node2mod[node] = modname
        #Register the node with every module along its path
        for depth in range(len(mods)):
            tmod = '-'.join(mods[:depth + 1])
            if tmod not in mod2node:
                mod2node[tmod] = Module()
                mod2node[tmod].level = depth
            module = mod2node[tmod]
            #Record the child for every path, not only the one that created
            #the module, so that all sibling submodules are listed once
            if depth + 1 < len(mods):
                module.children = True
                child = mods[depth + 1]
                if child not in module.submodules:
                    module.submodules.append(child)
            module.nodes.append(node)
    #class upkeeping
    for module in mod2node.values():
        module.attribute_generators()
    return mod2node, node2mod