Exemplo n.º 1
0
	def __init__(self, fn):
		"""Create a sequence database wrapper and load it from *fn*.

		:param fn: path to the database file
		"""
		# Initialize attributes
		self.fn = fn  # db filename
		self.db = bidict({})  # otu <-> seq (bidirectional lookup)
		self.size = {}  # otu -> size

		# Load sequence database. load_db() works by side effect on the
		# instance; the previous ``self = self.load_db()`` only rebound the
		# local name ``self`` and had no effect on the object, so the
		# assignment is dropped.
		self.load_db()
Exemplo n.º 2
0
    def to_tensor(self, always_all_nodes=True):
        """
        Return a tensor representation of the dynamic graph.

        Builds one adjacency matrix per snapshot, all sharing a single global
        node ordering, together with the mapping that defines that ordering
        and the per-snapshot list of active nodes.

        :param always_all_nodes: if True, even if a node is not active during
            a snapshot, it is included in that snapshot's matrix
        :return: 3 elements (A, B, C) — A: list of numpy matrices (as nested
            lists), B: a bidictionary {node name: node order in the matrix},
            C: active nodes at each step, as a list of lists of node ids
        """
        # Global node ordering: ids start at 1, following the order of the
        # aggregate graph's node listing.
        node_ids = bidict()
        for rank, node in enumerate(self.aggregate().nodes().keys()):
            node_ids[node] = rank + 1

        ordered_nodes = list(node_ids.keys())

        matrices = []
        active_per_step = []
        for snapshot in self.snapshots().values():
            graph = snapshot.copy()
            # Nodes of this snapshot, kept in the global order.
            present = [node for node in node_ids.keys() if node in graph.nodes]

            # Transform to a numpy matrix over the full global node list.
            if always_all_nodes:
                graph.add_nodes_presence_from(ordered_nodes)
            matrices.append(
                nx.to_numpy_matrix(graph, nodelist=ordered_nodes).tolist())
            active_per_step.append([node_ids[node] for node in present])
        return (matrices, node_ids, active_per_step)
def compute_oligotype_table(raw_trimmed, raw_dereplicated, clustering_file, separator, oligotype_table_filename):
    """Write a tab-separated oligotype-by-sample abundance table.

    Inputs:
        raw_trimmed: raw reads, trimmed to final form (FASTA)
        raw_dereplicated: dereplicated reads (FASTA)
        clustering_file: output clustering file from 'usearch8 -cluster_otus' (-parseout option)
        separator: separator character, e.g. '_' for '>sampleID_sequenceNumber' sequence IDs
        oligotype_table_filename: output filename

    Each output row is an oligotype ID of the form '<OTU>.<n>' and each
    column is a sample; cells hold read counts.
    """
    # 1. Get sequence counts.
    #    x has x[seq][sample] = count (of that sequence in that sample),
    #    where seq is the ATCG sequence in the raw_trimmed FASTA file.
    x, samples = pull_counts(raw_trimmed, separator)

    # 2. Populate sequence lookup: seq <--> seqID (bidirectional).
    sequence_lookup = bidict({})
    for record in util.iter_fst(raw_dereplicated):
        [sid, seq] = record[:2]
        # Header looks like '>ID;...': strip '>' and anything after ';'.
        sequence_lookup[seq] = int(sid[1:].split(';')[0])

    # 3. Populate clustering lookups (from otu_clustering.tab).
    clustering_lookup = {}  # seqID -> 'otu' | 'match' | 'chimera' | ...
    OTU_lookup = {}         # seqID -> OTU_ID centroid
    with open(clustering_file, 'r') as fid:
        # Stream the file instead of readlines(): same rows, less memory.
        for line in fid:
            split_line = line.split()
            seqID = int(split_line[0].split(';')[0])
            clustering_lookup[seqID] = split_line[1]
            if split_line[1] in ('match', 'otu'):
                OTU_lookup[seqID] = split_line[4]

    # 4. Group the non-chimeric sequences of each OTU. The three dicts hold
    #    parallel lists: entry i of each list describes the same oligotype.
    OTU_oligos = {}           # OTU_ID -> ['ACAGT', 'ACAAT', ...]
    OTU_original_seqIDs = {}  # OTU_ID -> [seqID1, seqID2, ...] (original sequence IDs)
    OTU_oligo_IDs = {}        # OTU_ID -> [0, 1, ...] (oligotype IDs)
    for seq, seqID in sequence_lookup.items():
        if clustering_lookup[seqID] == "chimera":
            continue
        OTU_centroid = OTU_lookup[seqID]
        if OTU_centroid not in OTU_oligos:
            OTU_oligos[OTU_centroid] = []
            OTU_oligo_IDs[OTU_centroid] = []
            OTU_original_seqIDs[OTU_centroid] = []
        if seq not in OTU_oligos[OTU_centroid]:
            OTU_oligos[OTU_centroid].append(seq)
            OTU_original_seqIDs[OTU_centroid].append(seqID)
            # Oligotype IDs are simply 0, 1, 2, ... in insertion order.
            OTU_oligo_IDs[OTU_centroid].append(len(OTU_oligos[OTU_centroid]) - 1)

    # 5. Get counts for each oligotype; full ID is '<OTU>.<oligoID>'.
    oligotype_counts = {}
    for seq in x:
        # Sequences missing from the lookups (e.g. filtered/chimeric reads)
        # are skipped. Only the narrow, expected errors are caught — the
        # previous bare `except:` silently swallowed every failure.
        try:
            seqID = sequence_lookup[seq]
            if clustering_lookup[seqID] != "chimera":
                OTU_centroid = OTU_lookup[seqID]
                # Look up which oligotype ID this sequence is.
                index = OTU_oligos[OTU_centroid].index(seq)
                oligo_ID = OTU_oligo_IDs[OTU_centroid][index]
                full_oligotype_ID = str(OTU_centroid) + '.' + str(oligo_ID)
                # Per-sample counts for this oligotype.
                oligotype_counts[full_oligotype_ID] = x[seq]
        except (KeyError, ValueError):
            continue

    # 6. Write the table: each row is an oligotype (e.g. 1.0 or 1.1) and
    #    each column is a sample.
    with open(oligotype_table_filename, 'w') as fid:
        fid.write("#Oligotype" + '\t' + '\t'.join(samples) + '\n')
        for full_oligotype_ID in oligotype_counts:
            oligo_abundances_per_sample = ['0'] * len(samples)
            for sampleID in oligotype_counts[full_oligotype_ID]:
                index = samples.index(sampleID)  # index in samples vector
                oligo_abundances_per_sample[index] = str(oligotype_counts[full_oligotype_ID][sampleID])
            fid.write(full_oligotype_ID + '\t' + '\t'.join(oligo_abundances_per_sample) + '\n')
You do need to run pop() to initialize the dictionary for any of these functions to work"""
import pandas as pd
import bidict # allows a two way dictionary
from bidict import bidict
import customExceptions

conventions = """ """  # placeholder for project naming conventions (currently empty)

####################################################################################################
####################################################################################################
# Initializing objects, constants, definitions
###########################

## Create the libraries needed - long hand on the right, short hand as key (on left, first).
## Separate libraries are needed for each field; initialized here.
## Each is a bidict, so lookups work in both directions (short form <-> long form).
province_dictionary = bidict()
district_dictionary  = bidict()
city_dictionary  = bidict()
ward_dictionary  = bidict ()
site_dictionary  = bidict()
sensor_dictionary  = bidict()
data_dictionary  = bidict()  # data title with description, mostly ignored


select  = {
"""a map of keywords to dictionary objects
 can be expanded to deal with case sensitivity"""
	'province':province_dictionary ,
	'district':district_dictionary ,
	'city':city_dictionary ,
	'ward':ward_dictionary ,
Exemplo n.º 5
0
    def __init__(self, fn):
        """Set up an empty sequence database for the file *fn*.

        Attributes:
            fn:   db filename
            db:   bidict, otu <-> seq
            size: dict, otu -> size
        """
        self.fn, self.db, self.size = fn, bidict({}), {}
Exemplo n.º 6
0
__author__ = "sunmingming01"


import bidict


PADDING_POS_STR = "$$$$"  # sentinel string reserved for padding

# Bidirectional map pos-string <-> integer id; id 0 is reserved for the
# padding sentinel, real POS tags get ids starting at 1 (see alloc_pos_id).
pos_id_map = bidict({PADDING_POS_STR: 0})  # reserved for padding str


def alloc_pos_id(pos):
    """Return the integer id for *pos*, allocating the next free id on first use."""
    if pos not in pos_id_map:
        # Ids are dense: the next id equals the current map size.
        pos_id_map[pos] = len(pos_id_map)
    return pos_id_map[pos]


class Pos(object):
    def __init__(self, content):
        """Store the raw POS string and its (possibly newly allocated) integer id."""
        self.content = content
        self.id = alloc_pos_id(content)

    @classmethod
    def padding_pos(cls):
        """Return a Pos wrapping the reserved padding sentinel string."""
        return Pos(PADDING_POS_STR)

    @classmethod
def compute_oligotype_table(raw_trimmed, raw_dereplicated, clustering_file,
                            separator, oligotype_table_filename):
    """Write a tab-separated oligotype-by-sample abundance table.

    Inputs:
        raw_trimmed: raw reads, trimmed to final form (FASTA)
        raw_dereplicated: dereplicated reads (FASTA)
        clustering_file: output clustering file from 'usearch8 -cluster_otus' (-parseout option)
        separator: separator character, e.g. '_' for '>sampleID_sequenceNumber' sequence IDs
        oligotype_table_filename: output filename

    Each output row is an oligotype ID of the form '<OTU>.<n>' and each
    column is a sample; cells hold read counts.
    """
    # 1. Get sequence counts.
    #    x has x[seq][sample] = count (of that sequence in that sample),
    #    where seq is the ATCG sequence in the raw_trimmed FASTA file.
    x, samples = pull_counts(raw_trimmed, separator)

    # 2. Populate sequence lookup: seq <--> seqID (bidirectional).
    sequence_lookup = bidict({})
    for record in util.iter_fst(raw_dereplicated):
        [sid, seq] = record[:2]
        # Header looks like '>ID;...': strip '>' and anything after ';'.
        sequence_lookup[seq] = int(sid[1:].split(';')[0])

    # 3. Populate clustering lookups (from otu_clustering.tab).
    clustering_lookup = {}  # seqID -> 'otu' | 'match' | 'chimera' | ...
    OTU_lookup = {}  # seqID -> OTU_ID centroid
    with open(clustering_file, 'r') as fid:
        # Stream the file instead of readlines(): same rows, less memory.
        for line in fid:
            split_line = line.split()
            seqID = int(split_line[0].split(';')[0])
            clustering_lookup[seqID] = split_line[1]
            if split_line[1] in ('match', 'otu'):
                OTU_lookup[seqID] = split_line[4]

    # 4. Group the non-chimeric sequences of each OTU. The three dicts hold
    #    parallel lists: entry i of each list describes the same oligotype.
    OTU_oligos = {}  # OTU_ID -> ['ACAGT', 'ACAAT', ...]
    OTU_original_seqIDs = {}  # OTU_ID -> [seqID1, seqID2, ...] (original sequence IDs)
    OTU_oligo_IDs = {}  # OTU_ID -> [0, 1, ...] (oligotype IDs)
    for seq, seqID in sequence_lookup.items():
        if clustering_lookup[seqID] == "chimera":
            continue
        OTU_centroid = OTU_lookup[seqID]
        if OTU_centroid not in OTU_oligos:
            OTU_oligos[OTU_centroid] = []
            OTU_oligo_IDs[OTU_centroid] = []
            OTU_original_seqIDs[OTU_centroid] = []
        if seq not in OTU_oligos[OTU_centroid]:
            OTU_oligos[OTU_centroid].append(seq)
            OTU_original_seqIDs[OTU_centroid].append(seqID)
            # Oligotype IDs are simply 0, 1, 2, ... in insertion order.
            OTU_oligo_IDs[OTU_centroid].append(
                len(OTU_oligos[OTU_centroid]) - 1)

    # 5. Get counts for each oligotype; full ID is '<OTU>.<oligoID>'.
    oligotype_counts = {}
    for seq in x:
        # Sequences missing from the lookups (e.g. filtered/chimeric reads)
        # are skipped. Only the narrow, expected errors are caught — the
        # previous bare `except:` silently swallowed every failure.
        try:
            seqID = sequence_lookup[seq]
            if clustering_lookup[seqID] != "chimera":
                OTU_centroid = OTU_lookup[seqID]
                # Look up which oligotype ID this sequence is.
                index = OTU_oligos[OTU_centroid].index(seq)
                oligo_ID = OTU_oligo_IDs[OTU_centroid][index]
                full_oligotype_ID = str(OTU_centroid) + '.' + str(oligo_ID)
                # Per-sample counts for this oligotype.
                oligotype_counts[full_oligotype_ID] = x[seq]
        except (KeyError, ValueError):
            continue

    # 6. Write the table: each row is an oligotype (e.g. 1.0 or 1.1) and
    #    each column is a sample.
    with open(oligotype_table_filename, 'w') as fid:
        fid.write("#Oligotype" + '\t' + '\t'.join(samples) + '\n')
        for full_oligotype_ID in oligotype_counts:
            oligo_abundances_per_sample = ['0'] * len(samples)
            for sampleID in oligotype_counts[full_oligotype_ID]:
                index = samples.index(sampleID)  # index in samples vector
                oligo_abundances_per_sample[index] = str(
                    oligotype_counts[full_oligotype_ID][sampleID])
            fid.write(full_oligotype_ID + '\t' +
                      '\t'.join(oligo_abundances_per_sample) + '\n')
Exemplo n.º 8
0
    Expr('Beth'),
    Expr('Bob'),
    Expr('Carol'),
    Expr('Lisa'),
    Expr('Amy'),
    Expr('Ellis'),
    Expr('Lorde')
}
# Pool of single-character proposition names; the alphabet is repeated so
# more than 26 propositions can be allocated.
PROPOSITIONS = list(
    "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"
)
# Pool of single-digit characters, likewise repeated for reuse.
NUMBERS = list("123456789012345678901234567890123456789012345678901234567890")
domain = list()  # NOTE(review): populated elsewhere — purpose inferred from name, confirm
var_s = list()   # NOTE(review): populated elsewhere — presumably variables seen so far
terms = set()    # NOTE(review): populated elsewhere — presumably all terms encountered
mapping = bidict({})  # two-way map between symbols and their short names


def eliminate_functions_from(expression):
    """Given a functional expression, returns an equivalent expression in relational form
    e.g. given expression: (F(x) ~= Adam) | (M(x) = Beth)
    returns: -F(x,Adam) | M(x,Beth)"""
    s = expr(expression)  #Make sure to convert an argument to type-expression
    if not s.args or is_symbol(s.op):  #If a literal is received just return it
        return s
    args = list(map(eliminate_functions_from, s.args))
    if s.op == '==' or s.op == '~=':  #recognise function related operators
        rel = increase_arity(list(s.args),
                             oper=s.op)  #convert the function into relation
        return Expr(rel.op, *tuple(rel.args))
    else:
Exemplo n.º 9
0
	def __init__(self, fn):

		# Initialize attributes
		self.fn = fn # db filename
		self.db = bidict({}) # otu <-> seq
		self.size = {} # otu -> size