def __init__(self, fn):
    """Set up the sequence-database wrapper and load its contents.

    :param fn: path of the database file to load
    """
    # Initialize attributes
    self.fn = fn          # db filename
    self.db = bidict({})  # otu <-> seq
    self.size = {}        # otu -> size
    # Load sequence database. The original wrote `self = self.load_db()`,
    # but rebinding the local name `self` has no effect outside __init__
    # (the caller keeps the original instance regardless); load_db()
    # presumably mutates the instance in place, so a plain call suffices.
    self.load_db()
def to_tensor(self, always_all_nodes=True):
    """Return a tensor representation of the dynamic network.

    Builds one adjacency matrix per snapshot, with rows/columns following a
    single global node ordering, together with the name-to-index mapping and
    the list of nodes active at each step.

    :param always_all_nodes: if True, even if a node is not active during a
        snapshot, it is included in that snapshot's matrix
    :return: 3 elements (A, B, C):
        A: list of numpy matrices (as nested lists), one per snapshot,
        B: a bidictionary {node name: node order in the matrix},
        C: active nodes at each step, as a list of lists of node ids
    """
    # Give every node of the aggregated graph a 1-based index.
    node_ids = bidict()
    for order, node in enumerate(self.aggregate().nodes().keys(), start=1):
        node_ids[node] = order
    global_order = list(node_ids.keys())

    matrices = []
    active_per_step = []
    for snapshot in self.snapshots().values():
        current = snapshot.copy()
        # Nodes of this snapshot, kept in the global ordering (computed
        # before any padding nodes are added below).
        present = [node for node in global_order if node in current.nodes]
        if always_all_nodes:
            current.add_nodes_presence_from(global_order)
        matrices.append(
            nx.to_numpy_matrix(current, nodelist=global_order).tolist())
        active_per_step.append([node_ids[node] for node in present])
    return (matrices, node_ids, active_per_step)
def compute_oligotype_table(raw_trimmed, raw_dereplicated, clustering_file, separator, oligotype_table_filename):
    """Compute per-sample oligotype counts and write them as a TSV table.

    Inputs:
      raw_trimmed: raw reads, trimmed to final form (FASTA)
      raw_dereplicated: dereplicated reads (FASTA)
      clustering_file: output clustering file from 'usearch8 -cluster_otus'
          (-parseout option)
      separator: separator character, e.g. '_' for '>sampleID_sequenceNumber'
          sequence IDs
      oligotype_table_filename: output filename; one row per oligotype
          ("OTU.oligoID"), one column per sample
    """
    # 1. Get sequence counts: x[seq][sample] = count of that sequence in
    # that sample, where seq is the ATCG sequence in the raw_trimmed file.
    x, samples = pull_counts(raw_trimmed, separator)

    # 2. Populate sequence lookup (seq <--> seqID).
    sequence_lookup = bidict({})
    iter_fst = util.iter_fst
    for record in iter_fst(raw_dereplicated):
        [sid, seq] = record[:2]
        sid = int(sid[1:].split(';')[0])
        sequence_lookup[seq] = sid

    # 3. Populate clustering_lookup (from otu_clustering.tab).
    clustering_lookup = {}  # seqID -> 'otu' / 'match' / 'chimera'
    OTU_lookup = {}         # seqID -> OTU_ID centroid
    with open(clustering_file, 'r') as fid:
        # Stream the file instead of readlines(): same lines, less memory.
        for line in fid:
            split_line = line.split()
            seqID = int(split_line[0].split(';')[0])
            clustering_lookup[seqID] = split_line[1]
            if split_line[1] in ('match', 'otu'):
                OTU_lookup[seqID] = split_line[4]

    # 4. Group every non-chimeric sequence (oligotype) under its OTU
    # centroid. The three dicts hold parallel lists, ordered identically.
    OTU_oligos = {}           # OTU_ID -> ['ACAGT', 'ACAAT', ...]
    OTU_original_seqIDs = {}  # OTU_ID -> [seqID1, seqID2, ...]
    OTU_oligo_IDs = {}        # OTU_ID -> [0, 1, ...] (oligotype IDs)
    for seq in sequence_lookup:
        seqID = sequence_lookup[seq]
        if clustering_lookup[seqID] == "chimera":
            continue
        OTU_centroid = OTU_lookup[seqID]
        if OTU_centroid not in OTU_oligos:
            OTU_oligos[OTU_centroid] = []
            OTU_oligo_IDs[OTU_centroid] = []
            OTU_original_seqIDs[OTU_centroid] = []
        if seq not in OTU_oligos[OTU_centroid]:
            OTU_oligos[OTU_centroid].append(seq)
            OTU_original_seqIDs[OTU_centroid].append(seqID)
            # Oligotype IDs are consecutive 0, 1, 2, ..., i.e. len - 1
            # after the append above (same as "last + 1, else 0").
            OTU_oligo_IDs[OTU_centroid].append(len(OTU_oligos[OTU_centroid]) - 1)
    # (The unused `oligotype_list` built here originally has been dropped.)

    # Get counts for each oligotype. Each distinct sequence is its own
    # oligotype, so plain assignment (not accumulation) is correct.
    oligotype_counts = {}  # "OTU.oligoID" -> {sample: count}
    for seq in x:
        # Narrowed from a bare `except:` so real bugs are no longer hidden.
        # KeyError: seq never dereplicated or never clustered;
        # ValueError: seq not registered as an oligotype of its OTU.
        try:
            seqID = sequence_lookup[seq]
            if clustering_lookup[seqID] == "chimera":
                continue
            OTU_centroid = OTU_lookup[seqID]
            index = OTU_oligos[OTU_centroid].index(seq)
        except (KeyError, ValueError):
            continue
        oligo_ID = OTU_oligo_IDs[OTU_centroid][index]
        full_oligotype_ID = str(OTU_centroid) + '.' + str(oligo_ID)
        oligotype_counts[full_oligotype_ID] = x[seq]

    # Write the oligotype table: each row is an oligotype (e.g. 1.0 or 1.1),
    # each column a sample.
    sample_index = {s: i for i, s in enumerate(samples)}  # O(1) column lookup
    with open(oligotype_table_filename, 'w') as fid:
        fid.write("#Oligotype" + '\t' + '\t'.join(samples) + '\n')
        for full_oligotype_ID in oligotype_counts:
            row = ['0'] * len(samples)
            for sampleID in oligotype_counts[full_oligotype_ID]:
                row[sample_index[sampleID]] = str(
                    oligotype_counts[full_oligotype_ID][sampleID])
            fid.write(full_oligotype_ID + '\t' + '\t'.join(row) + '\n')
You do need to run pop() to intialize the dictionary for any of these functions to work""" import pandas as pd import bidict # allows a two way dictionary from bidict import bidict import customExceptions conventions = """ """ #################################################################################################### #################################################################################################### #Initializing objects, constants, definitions ########################### ## Create the libraries needed - long hand on the right, short hand as key (on left, first) ## Seperate libraries are needed forr each field. Initialized here province_dictionary = bidict() district_dictionary = bidict() city_dictionary = bidict() ward_dictionary = bidict () site_dictionary = bidict() sensor_dictionary = bidict() data_dictionary = bidict() #data title with description, mostly ignored select = { """a map of keywords to dictionary objects can be expanded to deal with case sensitivity""" 'province':province_dictionary , 'district':district_dictionary , 'city':city_dictionary , 'ward':ward_dictionary ,
def __init__(self, fn):
    """Set up an empty OTU database bound to the file *fn*.

    :param fn: database filename
    """
    self.fn = fn          # path of the database file
    self.db = bidict({})  # two-way mapping: otu <-> sequence
    self.size = {}        # per-otu size
__author__ = "sunmingming01" import bidict PADDING_POS_STR = "$$$$" pos_id_map = bidict({PADDING_POS_STR: 0}) # reserved for padding str def alloc_pos_id(pos): if pos not in pos_id_map: cur_top_idx = len(pos_id_map) pos_id_map[pos] = cur_top_idx return pos_id_map[pos] class Pos(object): def __init__(self, content): self.id = alloc_pos_id(content) self.content = content @classmethod def padding_pos(cls): padding_pos = Pos(PADDING_POS_STR) return padding_pos @classmethod
def compute_oligotype_table(raw_trimmed, raw_dereplicated, clustering_file, separator, oligotype_table_filename):
    """Compute per-sample oligotype counts and write them as a TSV table.

    Inputs:
      raw_trimmed: raw reads, trimmed to final form (FASTA)
      raw_dereplicated: dereplicated reads (FASTA)
      clustering_file: output clustering file from 'usearch8 -cluster_otus'
          (-parseout option)
      separator: separator character, e.g. '_' for '>sampleID_sequenceNumber'
          sequence IDs
      oligotype_table_filename: output filename; one row per oligotype
          ("OTU.oligoID"), one column per sample
    """
    # 1. Get sequence counts: x[seq][sample] = count of that sequence in
    # that sample, where seq is the ATCG sequence in the raw_trimmed file.
    x, samples = pull_counts(raw_trimmed, separator)

    # 2. Populate sequence lookup (seq <--> seqID).
    sequence_lookup = bidict({})
    iter_fst = util.iter_fst
    for record in iter_fst(raw_dereplicated):
        [sid, seq] = record[:2]
        sid = int(sid[1:].split(';')[0])
        sequence_lookup[seq] = sid

    # 3. Populate clustering_lookup (from otu_clustering.tab).
    clustering_lookup = {}  # seqID -> 'otu' / 'match' / 'chimera'
    OTU_lookup = {}         # seqID -> OTU_ID centroid
    with open(clustering_file, 'r') as fid:
        # Stream the file instead of readlines(): same lines, less memory.
        for line in fid:
            split_line = line.split()
            seqID = int(split_line[0].split(';')[0])
            clustering_lookup[seqID] = split_line[1]
            if split_line[1] in ('match', 'otu'):
                OTU_lookup[seqID] = split_line[4]

    # 4. Group every non-chimeric sequence (oligotype) under its OTU
    # centroid. The three dicts hold parallel lists, ordered identically.
    OTU_oligos = {}           # OTU_ID -> ['ACAGT', 'ACAAT', ...]
    OTU_original_seqIDs = {}  # OTU_ID -> [seqID1, seqID2, ...]
    OTU_oligo_IDs = {}        # OTU_ID -> [0, 1, ...] (oligotype IDs)
    for seq in sequence_lookup:
        seqID = sequence_lookup[seq]
        if clustering_lookup[seqID] == "chimera":
            continue
        OTU_centroid = OTU_lookup[seqID]
        if OTU_centroid not in OTU_oligos:
            OTU_oligos[OTU_centroid] = []
            OTU_oligo_IDs[OTU_centroid] = []
            OTU_original_seqIDs[OTU_centroid] = []
        if seq not in OTU_oligos[OTU_centroid]:
            OTU_oligos[OTU_centroid].append(seq)
            OTU_original_seqIDs[OTU_centroid].append(seqID)
            # Oligotype IDs are consecutive 0, 1, 2, ..., i.e. len - 1
            # after the append above (same as "last + 1, else 0").
            OTU_oligo_IDs[OTU_centroid].append(len(OTU_oligos[OTU_centroid]) - 1)
    # (The unused `oligotype_list` built here originally has been dropped.)

    # Get counts for each oligotype. Each distinct sequence is its own
    # oligotype, so plain assignment (not accumulation) is correct.
    oligotype_counts = {}  # "OTU.oligoID" -> {sample: count}
    for seq in x:
        # Narrowed from a bare `except:` so real bugs are no longer hidden.
        # KeyError: seq never dereplicated or never clustered;
        # ValueError: seq not registered as an oligotype of its OTU.
        try:
            seqID = sequence_lookup[seq]
            if clustering_lookup[seqID] == "chimera":
                continue
            OTU_centroid = OTU_lookup[seqID]
            index = OTU_oligos[OTU_centroid].index(seq)
        except (KeyError, ValueError):
            continue
        oligo_ID = OTU_oligo_IDs[OTU_centroid][index]
        full_oligotype_ID = str(OTU_centroid) + '.' + str(oligo_ID)
        oligotype_counts[full_oligotype_ID] = x[seq]

    # Write the oligotype table: each row is an oligotype (e.g. 1.0 or 1.1),
    # each column a sample.
    sample_index = {s: i for i, s in enumerate(samples)}  # O(1) column lookup
    with open(oligotype_table_filename, 'w') as fid:
        fid.write("#Oligotype" + '\t' + '\t'.join(samples) + '\n')
        for full_oligotype_ID in oligotype_counts:
            row = ['0'] * len(samples)
            for sampleID in oligotype_counts[full_oligotype_ID]:
                row[sample_index[sampleID]] = str(
                    oligotype_counts[full_oligotype_ID][sampleID])
            fid.write(full_oligotype_ID + '\t' + '\t'.join(row) + '\n')
Expr('Beth'), Expr('Bob'), Expr('Carol'), Expr('Lisa'), Expr('Amy'),
    Expr('Ellis'), Expr('Lorde')
}
# Pools of fresh symbol names: lowercase letters (repeated so many symbols
# can be drawn) for propositions, and digit characters for numbering.
PROPOSITIONS = list(
    "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"
)
NUMBERS = list("123456789012345678901234567890123456789012345678901234567890")
# Working state shared by the conversion routines.
domain = list()       # collected domain constants
var_s = list()        # collected variables
terms = set()         # collected terms
mapping = bidict({})  # two-way mapping (presumably names <-> symbols — TODO confirm)


def eliminate_functions_from(expression):
    """Given a functional expression, returns an equivalent expression in relational form
    e.g. given expression: (F(x) ~= Adam) | (M(x) = Beth)
    returns: -F(x,Adam) | M(x,Beth)"""
    s = expr(expression)  # Make sure to convert an argument to type-expression
    if not s.args or is_symbol(s.op):  # If a literal is received just return it
        return s
    # NOTE(review): `args` holds the recursively converted sub-expressions,
    # but `increase_arity` below is fed the ORIGINAL s.args — confirm intent.
    args = list(map(eliminate_functions_from, s.args))
    if s.op == '==' or s.op == '~=':  # recognise function related operators
        rel = increase_arity(list(s.args), oper=s.op)  # convert the function into relation
        return Expr(rel.op, *tuple(rel.args))
    else: