Exemplo n.º 1
0
def load_genes(geneinter_file='', ignore_file='', squaring=True):
    '''
    Loads all of the gene pairs and their corresponding interaction scores
    into memory. It also keeps a set of all genes for iterative purposes.

    There is some criteria for excluding genes from this process:
      1) If an ignore gene list file is provided, any pair containing a gene
         from that file is skipped, so it never reaches 'gis' or 'genes'.
         (Every gene seen in the file is still recorded in 'genespace'.)
      2) If an interaction score is zero or cannot be parsed as a number,
         the pair is *KEPT* with an interaction score of 0.

    The results are stored in the module-level 'genespace', 'gis' and
    'genes' containers, since they are used pervasively throughout BPM
    generation.

    Finally, if we add the gene pair (g1, g2) with score S to the dictionary,
    then we'll also add (g2, g1) with score S to the dictionary. This increases
    memory usage but saves cpu cycles when looking up interaction scores.
    Basically, we force the dictionary to be a symmetric matrix.

    :param geneinter_file: Path of the tab-delimited interaction file. Only
                           used when there is no configuration (conf is None).
    :param ignore_file: Path of the ignore gene list (one gene per line).
                        Only used when there is no configuration.
    :param squaring: When true, scores are squared with their sign preserved.
    '''
    # The configuration, when present, decides which files are read;
    # otherwise fall back to the explicit arguments. The 'conf is None' test
    # must come first: reading conf.ignore before it made the fallback
    # unreachable and raised AttributeError whenever conf was missing.
    if conf is not None:
        ignore_path, inter_path = conf.ignore, conf.geneinter
    else:
        ignore_path, inter_path = ignore_file, geneinter_file

    ignore = set()
    if ignore_path:
        for line in gzipOpen(ignore_path):
            ignore.add(line.strip())

    # NOTE(review): because 'squaring' defaults to True, a false
    # conf.squaring only disables squaring when the caller also passes
    # squaring=False -- confirm this precedence is intended.
    square = (conf is not None and conf.squaring) or squaring

    reader = csv.reader(gzipOpen(inter_path), delimiter='\t')
    for row in reader:
        g1, g2, intscore = row[0], row[1], row[2]

        # genespace tracks every gene mentioned in the file, ignored or not.
        genespace.add(g1)
        genespace.add(g2)

        # Ignore pairs where one or both genes are in the ignore gene list
        if g1 in ignore or g2 in ignore:
            continue

        # If there is no interaction score, force it to be 0
        try:
            ginter = float(intscore)
        except ValueError:
            ginter = 0.0

        if square:
            # Square the magnitude but keep the sign of the original score.
            if ginter < 0:
                ginter = -(ginter ** 2)
            else:
                ginter = ginter ** 2

        # Store both orientations so lookups never need to order the pair.
        gis[(g1, g2)] = ginter
        gis[(g2, g1)] = ginter

        genes.add(g1)
        genes.add(g2)

    parallel.inc_counter(parallel.costs['load_genes'])
Exemplo n.º 2
0
def group_genes(indexed_gene):
    '''
    group_genes is applied to every gene, and a BPM is generated from *every*
    gene. In particular, given M happy bipartitions, generate a BPM where
    the first module contains all genes that appeared in the same set in the M
    bipartitions C% of the time and the second module contains all genes
    that appeared in the opposite set in the M bipartitions C% of the time.

    A gene that satisfies both thresholds is placed in the first module only.

    :param indexed_gene: A two-tuple (i, g1). Only the gene 'g1' is used;
                         the index 'i' is ignored.
    :return: A two-tuple of sets (mod1, mod2).
    '''
    # Unpack inside the body instead of in the signature: tuple parameters
    # are a syntax error on Python 3 (PEP 3113). Callers still pass a single
    # two-tuple, exactly as before.
    _, g1 = indexed_gene

    mod1, mod2 = set(), set()

    for g2 in geneinter.genes:
        # Count the number of bipartitions in which g2 is in the same set
        # as g1.
        freqsame = sum(
            1 for A, B in happyparts
            if (g1 in A and g2 in A) or (g1 in B and g2 in B)
        )

        # float() keeps this a true division on Python 2 as well.
        ratio = float(freqsame) / conf.M
        if ratio >= conf.C:
            mod1.add(g2)
        elif (1 - ratio) >= conf.C:
            mod2.add(g2)

    parallel.inc_counter()
    parallel.print_progress()

    return mod1, mod2
Exemplo n.º 3
0
def load_genes(geneinter_file='', ignore_file='', squaring=True):
    '''
    Loads all of the gene pairs and their corresponding interaction scores
    into memory. It also keeps a set of all genes for iterative purposes.

    There is some criteria for excluding genes from this process:
      1) If an ignore gene list file is provided, any pair containing a gene
         from that file is skipped, so it never reaches 'gis' or 'genes'.
         (Every gene seen in the file is still recorded in 'genespace'.)
      2) If an interaction score is zero or cannot be parsed as a number,
         the pair is *KEPT* with an interaction score of 0.

    The results are stored in the module-level 'genespace', 'gis' and
    'genes' containers, since they are used pervasively throughout BPM
    generation.

    Finally, if we add the gene pair (g1, g2) with score S to the dictionary,
    then we'll also add (g2, g1) with score S to the dictionary. This increases
    memory usage but saves cpu cycles when looking up interaction scores.
    Basically, we force the dictionary to be a symmetric matrix.

    :param geneinter_file: Path of the tab-delimited interaction file. Only
                           used when there is no configuration (conf is None).
    :param ignore_file: Path of the ignore gene list (one gene per line).
                        Only used when there is no configuration.
    :param squaring: When true, scores are squared with their sign preserved.
    '''
    # Check for a missing configuration *before* touching its attributes.
    # The previous order (conf.ignore first) raised AttributeError when conf
    # was None and left the 'ignore_file' fallback unreachable.
    if conf is None:
        ignore_path, inter_path = ignore_file, geneinter_file
    else:
        ignore_path, inter_path = conf.ignore, conf.geneinter

    ignore = set()
    if ignore_path:
        for line in gzipOpen(ignore_path):
            ignore.add(line.strip())

    # NOTE(review): because 'squaring' defaults to True, a false
    # conf.squaring only disables squaring when the caller also passes
    # squaring=False -- confirm this precedence is intended.
    square = (conf is not None and conf.squaring) or squaring

    reader = csv.reader(gzipOpen(inter_path), delimiter='\t')
    for row in reader:
        g1, g2, intscore = row[0], row[1], row[2]

        # genespace tracks every gene mentioned in the file, ignored or not.
        genespace.add(g1)
        genespace.add(g2)

        # Ignore pairs where one or both genes are in the ignore gene list
        if g1 in ignore or g2 in ignore:
            continue

        # If there is no interaction score, force it to be 0
        try:
            ginter = float(intscore)
        except ValueError:
            ginter = 0.0

        if square:
            # Square the magnitude but keep the sign of the original score.
            if ginter < 0:
                ginter = -(ginter**2)
            else:
                ginter = ginter**2

        # Store both orientations so lookups never need to order the pair.
        gis[(g1, g2)] = ginter
        gis[(g2, g1)] = ginter

        genes.add(g1)
        genes.add(g2)

    parallel.inc_counter(parallel.costs['load_genes'])
Exemplo n.º 4
0
def group_genes(indexed_gene):
    '''
    group_genes is applied to every gene, and a BPM is generated from *every*
    gene. In particular, given M happy bipartitions, generate a BPM where
    the first module contains all genes that appeared in the same set in the M
    bipartitions C% of the time and the second module contains all genes
    that appeared in the opposite set in the M bipartitions C% of the time.

    A gene that satisfies both thresholds is placed in the first module only.

    :param indexed_gene: A two-tuple (i, g1). Only the gene 'g1' is used;
                         the index 'i' is ignored.
    :return: A two-tuple of sets (mod1, mod2).
    '''
    # Tuple parameters in the signature are a syntax error on Python 3
    # (PEP 3113), so the single tuple argument is unpacked here instead.
    _, g1 = indexed_gene

    mod1, mod2 = set(), set()

    for g2 in geneinter.genes:
        # Count the number of bipartitions in which g2 is in the same set
        # as g1.
        freqsame = sum(
            1 for A, B in happyparts
            if (g1 in A and g2 in A) or (g1 in B and g2 in B)
        )

        # float() keeps this a true division on Python 2 as well.
        ratio = float(freqsame) / conf.M
        if ratio >= conf.C:
            mod1.add(g2)
        elif (1 - ratio) >= conf.C:
            mod2.add(g2)

    parallel.inc_counter()
    parallel.print_progress()

    return mod1, mod2
Exemplo n.º 5
0
def localmaxcut(m):
    '''
    Produces one 'happy' bipartition of the genes.

    Starting from a random bipartition, 'Weighted-Flip' (from Leiserson et
    al., 2011) is applied repeatedly: an unhappy gene is picked at random and
    moved to the opposite set, until no unhappy genes are left.

    The argument 'm' is not used by the body. Returns the pair (A, B).
    '''
    A, B = random_bipartition()

    def together(x, y):
        # True when both genes currently sit on the same side of the cut.
        return (x in A and y in A) or (x in B and y in B)

    def weights(gene):
        '''
        Computes the total neighboring weight of 'gene': the sum of its
        interactions with genes in its own set and the sum of its
        interactions with genes in the opposite set.

        The result is a dictionary with keys 'same' and 'other' rather than
        a tuple, because the totals are updated in place as vertices move
        between the partitions.
        '''
        totals = {'same': 0, 'other': 0}
        for neighbor in geneinter.genes:
            score = geneinter.gi(gene, neighbor)
            side = 'same' if together(gene, neighbor) else 'other'
            totals[side] += score
        return totals

    nweights = {}
    for gene in geneinter.genes:
        nweights[gene] = weights(gene)

    while True:
        # Recomputed at the top of every round, including the first one.
        unhappy = get_unhappy(nweights)
        if not unhappy:
            break

        v = random.choice(unhappy)

        # Move 'v' across the cut.
        if v in A:
            source, target = A, B
        else:
            source, target = B, A
        target.add(v)
        source.remove(v)

        # Updating the stored totals incrementally avoids recomputing
        # 'weights' for every gene, which is O(n^2) in the number of genes.
        # This pass is O(n) but comes at the cost of clarity.
        #
        # The flipped gene swaps its 'same' and 'other' totals; every other
        # gene shifts its interaction with 'v' from one total to the other.
        for g, nw in nweights.iteritems():
            if g == v:
                nw['same'], nw['other'] = nw['other'], nw['same']
            else:
                # Interaction score between 'g' and the gene that just moved.
                w = geneinter.gi(v, g)

                # If the two genes now share a set, 'g' gains happiness;
                # otherwise 'g' becomes more unhappy.
                if together(v, g):
                    gain, loss = 'same', 'other'
                else:
                    gain, loss = 'other', 'same'
                nw[gain] += w
                nw[loss] -= w

    parallel.inc_counter()
    parallel.print_progress()

    return A, B
Exemplo n.º 6
0

def enrich(modulecnt, module):
    '''
    Initiates a request to Funcassociate for one module and returns the
    module together with the resulting goterms.

    :param modulecnt: The total number of modules in the BPM file.
    :param module: A three-tuple (bpmi, modi, genes) representing a module.
                   'bpmi' is the BPM index number, 'modi' is the module
                   index number, and 'genes' is a list of gene names in the
                   module.
    :return: A four-tuple (bpmi, modi, genes, goterms) of the input module
             and its associated go terms.
    '''
    # Unpack in the body rather than in the signature: tuple parameters are
    # a syntax error on Python 3 (PEP 3113). Callers still pass one tuple.
    bpmi, modi, genes = module

    # The second argument is the module count clamped to [1000, 10000].
    # NOTE(review): presumably the number of Funcassociate repetitions --
    # confirm against faread.functionate.
    goterms = faread.functionate(genes, min(10000, max(1000, modulecnt)))

    parallel.inc_counter()
    parallel.print_progress()

    return bpmi, modi, genes, goterms


def sortgo(goterms):
    '''
    Sorts the keys of a goterms dictionary according to the current
    configuration.

    When there is no configuration (conf is None), 'reverse' is False and
    'sort_by' is 'p'. Otherwise 'reverse' is true only when conf.order_go
    equals 'desc'.

    NOTE(review): only the beginning of this function is visible here; how
    'reverse' and 'sort_by' are consumed is not documented.
    '''
    if conf is None:
        # No configuration available: fall back to fixed defaults.
        reverse = False
        sort_by = 'p'
    else:
        # Any value of conf.order_go other than 'desc' leaves reverse False.
        reverse = conf.order_go == 'desc'
Exemplo n.º 7
0
from bpm import conf, faread, parallel

def enrich(modulecnt, module):
    '''
    Initiates a request to Funcassociate for one module and returns the
    module together with the resulting goterms.

    :param modulecnt: The total number of modules in the BPM file.
    :param module: A three-tuple (bpmi, modi, genes) representing a module.
                   'bpmi' is the BPM index number, 'modi' is the module
                   index number, and 'genes' is a list of gene names in the
                   module.
    :return: A four-tuple (bpmi, modi, genes, goterms) of the input module
             and its associated go terms.
    '''
    # Tuple parameters in the signature are a syntax error on Python 3
    # (PEP 3113), so the single tuple argument is unpacked here instead.
    bpmi, modi, genes = module

    # The second argument is the module count clamped to [1000, 10000].
    # NOTE(review): presumably the number of Funcassociate repetitions --
    # confirm against faread.functionate.
    goterms = faread.functionate(genes, min(10000, max(1000, modulecnt)))

    parallel.inc_counter()
    parallel.print_progress()

    return bpmi, modi, genes, goterms

def sortgo(goterms):
    '''
    Sorts the keys of a goterms dictionary according to the current
    configuration.

    When there is no configuration (conf is None), 'reverse' is False and
    'sort_by' is 'p'. Otherwise 'reverse' is true only when conf.order_go
    equals 'desc', and 'sort_by' is taken from conf.sort_go_by.

    NOTE(review): only the beginning of this function is visible here; how
    'reverse' and 'sort_by' are consumed is not documented.
    '''
    if conf is None:
        # No configuration available: fall back to fixed defaults.
        reverse = False
        sort_by = 'p'
    else:
        # Any value of conf.order_go other than 'desc' leaves reverse False.
        reverse = conf.order_go == 'desc'
        sort_by = conf.sort_go_by
Exemplo n.º 8
0
def localmaxcut(m):
    '''
    Produces one 'happy' bipartition of the genes.

    A random bipartition is generated and then repaired with 'Weighted-Flip'
    (from Leiserson et al., 2011): while unhappy genes exist, one of them is
    chosen at random and moved to the opposite set.

    The argument 'm' is not used by the body. Returns the pair (A, B).
    '''
    A, B = random_bipartition()

    def same_side(x, y):
        # Both genes are in A, or both genes are in B.
        return (x in A and y in A) or (x in B and y in B)

    def weights(gene):
        '''
        Returns the total neighboring weight of 'gene' as a dictionary with
        two entries: 'same' is the sum of its interactions with genes in its
        own set, 'other' the sum of its interactions with genes in the
        opposite set.

        A dictionary is used instead of a tuple because these totals are
        modified in place as vertices move between the partitions.
        '''
        sums = {'same': 0, 'other': 0}
        for partner in geneinter.genes:
            score = geneinter.gi(gene, partner)
            key = 'same' if same_side(gene, partner) else 'other'
            sums[key] += score
        return sums

    nweights = {}
    for gene in geneinter.genes:
        nweights[gene] = weights(gene)

    while True:
        # The unhappy list is rebuilt before every flip (and once before
        # the first), then the loop stops as soon as it is empty.
        unhappy = get_unhappy(nweights)
        if not unhappy:
            break

        v = random.choice(unhappy)

        # Flip 'v' to the other side of the cut.
        if v in A:
            origin, destination = A, B
        else:
            origin, destination = B, A
        destination.add(v)
        origin.remove(v)

        # Rather than recomputing 'weights' for every gene (O(n^2) in the
        # number of genes), patch the stored totals in a single O(n) pass,
        # at some cost in clarity.
        #
        # The gene that moved swaps its own 'same' and 'other' totals; every
        # other gene moves its interaction with 'v' between its two totals.
        for g, nw in nweights.iteritems():
            if g == v:
                nw['same'], nw['other'] = nw['other'], nw['same']
            else:
                # Interaction score between 'g' and the gene that just moved.
                w = geneinter.gi(v, g)

                # Sharing a set with 'v' now boosts the happiness of 'g';
                # being separated from it makes 'g' more unhappy.
                if same_side(v, g):
                    raised, lowered = 'same', 'other'
                else:
                    raised, lowered = 'other', 'same'
                nw[raised] += w
                nw[lowered] -= w

    parallel.inc_counter()
    parallel.print_progress()

    return A, B