def distanceMatrixCorrelation(matrix1, matrix2, weights = None, collectComponents = False):
    """
    :param matrix1: first distance matrix
    :param matrix2: second distance matrix
    :param weights: optional matrix of per-row weights
    :return: mean, and STD of the Kendall Tau Distances between all rows, and
        sorted list of names in the order of better correlations
    """
    size = matrix1.getSize()
    assert(size == matrix2.getSize())
    assert((not weights) or (size == weights.getSize()))
    kendallList = [None] * size
    weightsAllOnes = [1.0] * size
    compDict = DefDict(list)
    compSet = set()
    if collectComponents:
        for vl in matrix1.getArray():
            for v in vl:
                compSet.add(v)
    for i in range(size):
        components = DefDict(float)
        kendallList[i] = calculateWeightedKendall(matrix1[i], matrix2[i],
            weights = weights[i] if weights else None,
            components = components if collectComponents else None)
        for k in compSet:
            compDict[k].append(components[k])
    sortedNames = sorted(zip(matrix1.names, kendallList), key = operator.itemgetter(1))
    compList = None
    if collectComponents:
        compList = map(np.mean, map(operator.itemgetter(1),
            sorted(compDict.iteritems(), key = operator.itemgetter(0))))
    return (np.mean(kendallList), np.std(kendallList), sortedNames, compList)
def dict_to_frame(gene_counts: defaultdict, index=None) -> pd.DataFrame:
    """ Combine all distributions in `gene_counts` into a data frame. """
    index_cache = set()
    gene_series = {}
    genes = sorted(gene_counts.keys())
    for gene in genes:
        s = dict_as_series(gene_counts[gene], index)
        gene_series[gene] = s
        index_cache = index_cache.union(set(s.index))
    reindex = list(index_cache)
    reindex.sort()
    if index is not None and len(index) > len(reindex):
        reindex = index
    df = pd.DataFrame(gene_series, index=reindex, columns=genes).fillna(0).astype(int)
    is_digit = map(
        lambda x: True if isinstance(x, str) and x.isdigit() else False,
        df.index)
    if all(is_digit):
        reindex = list(map(int, df.index))
        df.index = reindex
        return df.sort_index()
    return df
def frequency_search(freq: int, hist: defaultdict):
    word = []
    for v, k in hist.items():
        if k == freq:
            word.append(v)
    # print(word)
    return word
def fill_board_manhattan_distance_less_than(
        board: np.array, maximum: int,
        manhattan_distance_dict: collections.defaultdict) -> np.array:
    board_copy = board.copy()
    for index, distances in manhattan_distance_dict.items():
        sum_distances = sum(map(lambda x: x['distance'], distances))
        board_copy[index] = (1 if sum_distances < maximum else 0)
    return board_copy
def utils_str_length_bigger(st: collections.defaultdict):
    """Return the length of the longest first string among the values of the given dict"""
    m = 0
    for i in st.items():
        i = i[1][0]
        if len(i) > m:
            m = len(i)
    return m
def ToCsv(
    path: pathlib.Path, vocab_counts: defaultdict, node_count: int,
):
    vocab_entries = sorted(vocab_counts.items(), key=lambda x: -x[1])
    total_count = sum(vocab_counts.values())
    cumfreq = 0
    node_cumfreq = 0
    with open(str(path), "w") as f:
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(
            ("cumulative_frequency", "cumulative_node_frequency", "count", "text",)
        )
        for text, count in vocab_entries:
            cumfreq += count / total_count
            node_cumfreq += count / node_count
            writer.writerow((cumfreq, node_cumfreq, count, text))
def get_embeddings_per_log(data: defaultdict, model: fasttext.FastText) -> np.ndarray:
    # create embeddings per log but at first remove '\n' (newline character) from the end
    embeddings = [
        model.get_sentence_vector(log.rstrip()) for logs in data.values()
        for log in logs
    ]
    return np.asarray(embeddings)
def count_stats(stats_data: defaultdict):
    counter = {}
    for stat, inc in stats_data.items():
        if isinstance(stat, enum.Enum):
            stat = stat.value
        counter[stat] = len(inc)
    return sorted(counter.items(), key=lambda i: i[1], reverse=True)
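# Minimal usage sketch for count_stats; the Stat enum and the sample data below
# are illustrative assumptions, not part of the original module. Keys may be
# plain strings or Enum members; each value's length is what gets counted.
import enum
from collections import defaultdict

class Stat(enum.Enum):
    HITS = "hits"
    MISSES = "misses"

sample = defaultdict(list)
sample[Stat.HITS] = [1, 2, 3]
sample["errors"] = [1, 2]
sample[Stat.MISSES] = [1]
print(count_stats(sample))  # [('hits', 3), ('errors', 2), ('misses', 1)]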
def solution(gender_prefer: str, geo_limit: int, profile: defaultdict):
    results = []
    for name in profile.keys():
        if profile[name][0] in gender_prefer and int(profile[name][1]) <= geo_limit:
            results.append(name)
    if not results:
        return 'No one yet'
    return '\n'.join(sorted(results))
def _end(self, context: defaultdict, output_file: str) -> None:
    out_path = os.path.join(self.out_file, output_file + '.phrases.tsv')
    with open(out_path, 'w') as out:
        print('writing phrases to %s...' % out_path)
        for phrase_label, phrase in context.items():
            for span, count in phrase.items():
                out.write('%s\t%d\t%s\n' % (arg_to_a(phrase_label), count, span))
def _freeze_nested_defaultdict(d: defaultdict) -> dict:
    d = dict(d)
    for k, v in d.items():
        if isinstance(v, defaultdict):
            d[k] = _freeze_nested_defaultdict(v)
    return d
def get_average_scores(directors: defaultdict):
    """Iterate through the directors dict (returned by get_movies_by_director),
    return a list of tuples (director, average_score) ordered by highest score
    in descending order. Only take directors into account with >= MIN_MOVIES"""
    return sorted(
        [(d, calc_mean_score(directors[d])) for d in directors.keys()
         if len(directors[d]) >= MIN_MOVIES],
        key=lambda x: -x[1])
def guess_symbol(frequency_map: defaultdict, frequency: float) -> str:
    s = ''
    diff = 100.0
    for symbol, freq in frequency_map.items():
        if abs(frequency - freq) < diff:
            diff = abs(frequency - freq)
            s = symbol
    return s
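# Minimal usage sketch for guess_symbol; the tiny frequency table below is an
# illustrative assumption (approximate English letter frequencies in percent),
# not data from the original program.
from collections import defaultdict

letter_freqs = defaultdict(float, {'e': 12.7, 't': 9.1, 'a': 8.2, 'o': 7.5})
print(guess_symbol(letter_freqs, 9.0))  # 't' -> closest stored frequency to 9.0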
def scanning_error_rate(ticket_rules: defaultdict, tickets: List[List[int]]) -> int:
    """Calculates the scanning error rate = the sum of all values invalid for any field"""
    error_rate = sum(
        number for ticket in tickets for number in ticket
        if all(number not in field_range for field_range in ticket_rules.values())
    )
    return error_rate
def findIsolatedEdges(self, d: defaultdict):
    isolated = []
    for key in d.keys():
        if (not d[key]):
            # print(" Isolated Node -> ", key)
            isolated.append(key)
    for i in isolated:
        print(" Isolated key -> ", i)
def get_statistics_info(self, main_dict: defaultdict) -> defaultdict:
    average_by_all = self.get_average_profit_by_all_enterprize(main_dict)
    statistic_dict = defaultdict(list)
    for key, item in main_dict.items():
        if item <= average_by_all:
            statistic_dict["less_than_average"].append(key)
        else:
            statistic_dict["more_than_average"].append(key)
    return statistic_dict
def dfs_explore_words(exploring_node: defaultdict, word_so_far: str) -> None:
    for c in sorted(exploring_node.keys()):
        if c != self.end_of_word:
            if len(words_list) < max_recommendation:
                dfs_explore_words(exploring_node[c], word_so_far + c)
        else:
            words_list.append(word_so_far)
        if len(words_list) >= max_recommendation:
            return
def filter_words(histogram: defaultdict):
    filtered_words = []
    for w, i in histogram.items():
        if w.endswith('\'s') and w[:-2] in histogram:
            histogram[w[:-2]] += i
            filtered_words.append(w)
        elif w.endswith('s') and w[:-1] in histogram:
            histogram[w[:-1]] += i
            filtered_words.append(w)
    return filtered_words
def compare_default_dicts(a: defaultdict, b: defaultdict) -> bool:
    """Compare two defaultdicts, return True if equal, else False.

    Does a benign or soft compare. If the defaultdicts COULD become equal,
    they are considered equal.

    * Does NOT change the memory imprint of any of the dictionaries.
    * Any overlapping keys, must have same value.
    * Keys unique to one, must have the default value of the other.
    * Order of input does NOT matter.

    Example:
        a = defaultdict(lambda: "", a=42, b=42, c="")
        b = defaultdict(lambda: 42, c="", d="")
        compare_default_dicts(a, b) -> True

    Parameters
    ----------
    a : defaultdict
        Default dictionary from collections
    b : defaultdict
        Default dictionary from collections

    Returns
    -------
    bool : True if equal, else False
    """
    a_keys = set(a)
    b_keys = set(b)
    a_unique_keys = (a_keys | b_keys) - b_keys
    b_unique_keys = (a_keys | b_keys) - a_keys

    # The intersecting keys must have the same value
    if not all(a[key] == b[key] for key in (a_keys & b_keys)):
        return False

    # Keys unique to one, must have default value of other.
    if not all(b.default_factory() == a[key] for key in a_unique_keys):
        return False
    if not all(a.default_factory() == b[key] for key in b_unique_keys):
        return False

    return True
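# Runnable sketch of the docstring example above; the two dictionaries are
# illustrative only.
from collections import defaultdict

a = defaultdict(lambda: "", a=42, b=42, c="")
b = defaultdict(lambda: 42, c="", d="")
print(compare_default_dicts(a, b))  # True: each unique key holds the other's default
b["d"] = 7
print(compare_default_dicts(a, b))  # False: 'd' no longer equals a's default ""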
def comb0(coins: list, deno: int, cur: defaultdict, ans: set):
    if deno < 0:
        return
    elif deno == 0:
        ans.add(tuple((key, val) for key, val in cur.items()))
    else:
        for coin in coins:
            cur[coin] += 1
            comb0(coins, deno - coin, cur, ans)
            cur[coin] -= 1
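# Minimal driver sketch for comb0; the coin set and target below are illustrative.
# `cur` must be a defaultdict(int) and `ans` a set, matching the signature above.
from collections import defaultdict

answers = set()
comb0([1, 2], 3, defaultdict(int), answers)
for combo in sorted(answers):
    print(combo)  # ((1, 1), (2, 1)) and ((1, 3),): i.e. 1+2 and 1+1+1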
def distanceBetweenNodes(self, start: str, end: str, graph: defaultdict):
    print(" start -> ", start, " end -> ", end, " graph -> ", graph)
    for k in graph.keys():
        if (k == start):
            print(graph[k])
            if (end in graph[k]):
                print(" end is available -> ", end, " key -> ", k)
    return None
def print_hull_painting(pc: collections.defaultdict):
    min_x, min_y, max_x, max_y = 0, 0, 0, 0
    for j in pc.keys():
        min_x, min_y, max_x, max_y = min(min_x, j[0]), min(min_y, j[1]), max(max_x, j[0]), max(max_y, j[1])
    x_range, y_range = max_x - min_x + 1, max_y - min_y + 1
    x_shift, y_shift = -1 * min_x, -1 * min_y
    print(f"x: [{min_x}, {max_x}]; y: [{min_y}, {max_y}]")
    print(f"x_range: {x_range}; y_range: {y_range}")
    print(f"x_shift: {x_shift}; y_shift: {y_shift}")
    color_map = {0: " ", 1: "*"}
    data = [[color_map[0] for x in range(x_range)] for y in range(y_range)]
    # Put colors in data
    for j in pc.keys():
        data[y_shift + j[1]][x_shift + j[0]] = color_map[pc[j]]
    print("----- Image -----")
    for j in reversed(range(y_range)):
        print("".join(data[j]))
    print("----- End Image -----")
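# Illustrative call sketch for print_hull_painting; the tiny panel dict below,
# mapping (x, y) -> color (0 or 1), is made up for demonstration.
import collections

panels = collections.defaultdict(int)
for x, y in [(0, 0), (1, 1), (2, 2), (0, 2), (2, 0)]:
    panels[(x, y)] = 1
print_hull_painting(panels)  # renders a 3x3 image with an X of '*' characters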
def printFrequencies(Frequencies: defaultdict):
    sorted_dict = sorted(Frequencies.items(), key=lambda x: x[1], reverse=True)  # O(n*log(n))
    # for t in sorted_dict:          # O(n)
    #     print(t[0], "-> ", t[1])   # O(1) + O(1) + O(1) + O(1) = O(1)
    print(sorted_dict)
    with open("outputA.txt", "w") as file:
        for t in sorted_dict:  # O(n)
            s = t[0] + "-> " + str(t[1]) + '\n'
            file.write(s)  # O(1) + O(1) + O(1) + O(1) = O(1)
def fill_board_manhattan_distance(
        board: np.array,
        manhattan_distance_dict: collections.defaultdict) -> np.array:
    board_copy = board.copy()
    for index, distances in manhattan_distance_dict.items():
        min_distance = min(distances, key=lambda x: x['distance'])
        board_copy[index] = (min_distance['id'] if len([
            distance for distance in distances
            if (distance['distance'] == min_distance['distance'])
        ]) == 1 else '')
    return board_copy
def visualize(title: str, series: defaultdict, filename: str, per_continent: bool) -> None:
    if per_continent:
        world = pygal.maps.world.SupranationalWorld()
    else:
        world = pygal.maps.world.World()
    world.title = title
    for s in series.items():
        world.add(*s)
    world.render_to_file(f'images/{filename}.svg')
def dict_sum(a: defaultdict, b: dict, inplace: bool = True) -> defaultdict:
    """ Calculate a + b, key-by-key. """
    if not inplace:
        result = a.copy()
    else:
        result = a
    for key, value in b.items():
        result[key] += value
    return result
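# Quick usage sketch for dict_sum with made-up counts; with inplace=False the
# original defaultdict is left untouched and a summed copy is returned.
from collections import defaultdict

totals = defaultdict(int, {"a": 1, "b": 2})
merged = dict_sum(totals, {"b": 3, "c": 4}, inplace=False)
print(dict(merged))  # {'a': 1, 'b': 5, 'c': 4}
print(dict(totals))  # {'a': 1, 'b': 2} - unchanged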
def defaultdict_to_dict(dictionary: defaultdict) -> Dict:
    """Recursively convert nested :obj:`defaultdict` to :obj:`dict`.

    Args:
        dictionary: A defaultdict.

    Returns:
        The defaultdict as a :obj:`dict`.
    """
    if isinstance(dictionary, defaultdict):
        dictionary = {k: defaultdict_to_dict(v) for k, v in dictionary.items()}
    return dictionary
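# Minimal sketch showing defaultdict_to_dict on a two-level structure; the
# nested tree below is an illustrative assumption.
from collections import defaultdict

tree = defaultdict(lambda: defaultdict(int))
tree["outer"]["inner"] += 1
plain = defaultdict_to_dict(tree)
print(plain)        # {'outer': {'inner': 1}}
print(type(plain))  # <class 'dict'>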
def solution(n: int, islands: defaultdict, xs: defaultdict):
    ans = 0
    bit = BIT(n + 1)
    keys = sorted(islands.keys(), reverse=True)
    for key in keys:
        for x in sorted(islands[key]):
            ans += bit.query(1, xs[x])
            bit.update(xs[x], 1)
    return ans
def _fetch_broker_info(ctr_relation_buf: defaultdict):
    """Fetch broker information."""
    ctr_list = list()
    default_policy = {"broker_id": -1, "explorer_id": -1}
    for _broker, _explorer in ctr_relation_buf.items():
        default_policy.update({
            "broker_id": _broker,
            "explorer_id": list(_explorer)
        })
        ctr_list.append(default_policy.copy())
    return ctr_list
def get_embeddings_per_block(data: defaultdict, model: fasttext.FastText,
                             with_timedelta: bool) -> List:
    # create embeddings per block but at first remove '\n' (newline character) from the end
    if with_timedelta:
        embeddings = get_embeddings_with_timedeltas_per_block(data, model)
    else:
        embeddings = [
            np.asarray(
                [model.get_sentence_vector(log.rstrip()) for log in logs])
            for logs in data.values()
        ]
    return embeddings
def generate_table_rows(request_count: Counter, request_times: defaultdict) -> List[Dict]:
    table_rows = []
    time_total = sum(chain(*request_times.values()))
    count_total = sum(request_count.values())
    for url, times in request_times.items():
        count = request_count[url]
        time_sum = sum(times)
        table_rows.append({'count': count,
                           'url': url,
                           'count_perc': round(100 * count / count_total, 2),
                           'time_perc': round(100 * time_sum / time_total, 2),
                           'time_sum': round(time_sum, 2),
                           'time_avg': round(sum(times) / len(times), 2),
                           'time_max': round(max(times), 2),
                           'time_med': round(median(times), 2)})
    table_rows = sorted(table_rows, key=lambda x: x['time_sum'], reverse=True)
    return table_rows
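# Hedged usage sketch for generate_table_rows; the request data below is fabricated
# for illustration, and the function itself assumes Counter/defaultdict plus
# itertools.chain and statistics.median are already imported in its module.
from collections import Counter, defaultdict

counts = Counter({"/api": 3, "/home": 1})
times = defaultdict(list, {"/api": [0.1, 0.2, 0.3], "/home": [1.0]})
for row in generate_table_rows(counts, times):
    print(row["url"], row["time_sum"], row["time_perc"])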
def find_users(users: defaultdict, city_id, cities, needed_cities):
    response = pool.get_next_api().users.search(city=city_id, count=1000, v="5.44")
    count = response["count"]
    if count == 0:
        print("no users")
        return
    # If the city is not yet in the users database
    if users.get(city_id) == None:
        users[city_id] = defaultdict()
        users[city_id]["users"] = set()
        users[city_id]["count"] = 0
    # If less than 95% of the people from this city have been downloaded
    if abs(count - len(users[city_id]["users"])) * 100 / count > 5:
        print("percent: ", abs(count - len(users[city_id]["users"])) * 100 / count)
        # If the database already has people from this city
        if len(users[city_id]["users"]) != 0:
            users_before = set(users[city_id]["users"])
        else:
            users_before = set()
        # Add the people returned by the request to the database and to users_before
        for item in response["items"]:
            users_before.add(item["id"])
            users[city_id]["users"].add(item["id"])
        for user_id in users_before:
            try:
                response = pool.get_next_api().friends.get(user_id=user_id,
                                                           order="random",
                                                           fields="city",
                                                           v="5.44")
            except vk.exceptions.VkAPIError as err:
                print(err)
                users[city_id]["users"].discard(user_id)
                continue
            add_users(users, response["items"], needed_cities)
            print("users in ", cities[city_id]["title"], " now: ",
                  len(users[city_id]["users"]))
            # print(response["items"])
            dumpData(users, usersFile)
            if (abs(count - len(users[city_id]["users"])) * 100 / count < 5) or \
                    (len(users[city_id]["users"]) > count):
                print("less than 5%")
                break
            sleep(randint(1, 3))
    else:
        print("City already exists")
def get_cities(cities: defaultdict, region_id: int, users, needed_cities):
    for city_tup in sorted(cities.items(), key=lambda y: y[1]["uc"], reverse=True):
        city = city_tup[1]
        city_id = city["id"]
        if city["uc"] == -1:
            cities[city_id]["uc"] = get_city_users_count(city_id)
            city["uc"] = cities[city_id]["uc"]
            dumpData(cities, citiesFile)
        if city["uc"] > 5000:
            print(city["id"], ": ", city["title"], " - ", city["uc"])
            needed_cities.add(city_id)
            if users.get(city_id) == None:
                users[city_id] = defaultdict()
                users[city_id]["users"] = set()
                users[city_id]["count"] = 0
                dumpData(users, usersFile)
            find_users(users, city["id"], cities, needed_cities)
            dumpData(users, usersFile)
            sleep(randint(1, 3))
    return cities
def unique_words(hist: defaultdict):
    return len(hist.keys())
#input1 = [i.strip().split('\\')[-1][:-5] for i in open('soda_pt_rice').readlines()]
label = test_label
name = []
for i in input1:
    s = re.findall('(?i)[a-z]{2,}', i)
    name.append(' '.join(s))

cv = CV(analyzer='char_wb', ngram_range=(3,4))
test_fn = cv.fit_transform(name).toarray()
#test_fd = test_fn
fold = 10
kf = KFold(len(test_fn), n_folds=fold, shuffle=True)
iteration = 100
#lr_ = LR()
#clf for use
lr_ = SVC(kernel='linear', probability=True)
CI = DD() #confidence level for each oracle
acc_ = [[] for i in range(iteration)] #acc in each run for averaging
for train, test in kf:
    fd_ = []
    label_ = []
    #TBD: randomly pick two examples from diff classes as starting
    fd_.append(train[0])
    label_.append(test_label[train[0]])
    train = train[1:]
    #needs one more ex from a diff class
    tmp = 0
    for i in train:
        if test_label[i] == label_[0]:
            continue
        else:
            fd_.append(i)
frenchTags = [f.strip() for f in frenchTagset if f.strip() not in commonTags]
engTags = [e.strip() for e in engTagset if e.strip() not in commonTags]
frenchTagset.close()
engTagset.close()
print "Tagsets Loaded!!"

enLines = [l.strip() for l in enData]
frLines = [f.strip() for f in frData]
alignLines = [a.strip() for a in alignFile]
print "Pure Data and Alignments Loaded!!"

contextDict = DD(int)
testIndices = randomSampleTrain()
print "Train and Test separated!!"
trainIndices = [i for i in range(len(enLines)) if i not in testIndices]

print "Starting Dictionary fillup!!"
for k in range(1):
    count = 0
    for i in trainIndices:
class UtilMultiFile(UtilObject):
    """
    Keeps specified number of files opened, for read or write
    Attributes:
        mode - read or write
        maxCount - maximum number of opened files at any given moment
        hitCount - number of open file hits
        xactCount - number of transactions (reads or writes)
        fileList - list of open file names, sorted by the time
        fileDict - map of file name to a file handle
    """

    def __init__(self, maxCount, mode):
        self.maxCount = maxCount
        self.mode = mode
        self.fileList = []
        self.fileDict = {}
        self.fileCache = DefDict(list)
        self.hitCount = 0
        self.xactCount = 0

    def write(self, fileName, line):
        # Try to cache it first
        lines = self.fileCache[fileName]
        lines.append(line)
        if len(lines) > 100:
            self.cacheFlush(fileName)

    def cacheFlush(self, fileName):
        assert(self.mode[0] in ('w', 'a'))
        f = self.fileHandle(fileName)
        for l in self.fileCache[fileName]:
            try:
                f.write(l)
            except IOError as e:
                print("Could not write to %s: error %d %s" % (
                    fileName, e.errno, e.strerror))
                return
        self.fileCache[fileName] = []
        self.xactCount += 1

    def fileHandle(self, fileName):
        if fileName not in self.fileDict:
            if len(self.fileList) == self.maxCount:
                oldFileName = self.fileList[0]
                self.fileDict[oldFileName].close()
                del self.fileDict[oldFileName]
                self.fileList = self.fileList[1:]
            try:
                f = open(fileName, self.mode)
            except IOError as e:
                print("Could not open %s: error %d %s" % (
                    fileName, e.errno, e.strerror))
                return None
            self.fileDict[fileName] = f
            self.fileList.append(fileName)
        else:
            self.hitCount += 1
        return self.fileDict[fileName]

    def closeAll(self):
        for fileName in self.fileCache.keys():
            self.cacheFlush(fileName)
        for f in self.fileDict.values():
            f.close()
        self.fileDict = {}
        self.fileList = []

    def getStats(self):
        return "%u hits out of %u transactions: %u%%" % (self.hitCount,
            self.xactCount,
            (100 * self.hitCount / self.xactCount) if self.xactCount else 0)
cogRegInt = calculateCogRegInt(cogReg)
cogDict, cogWeightDictList, taxaDict, taxDist = \
    buildCogTaxaDict(interpolationRange=range(cogRegInt, cogRegInt+2))
cogDist = buildCogDistances(cogDict, cogWeightDictList, **CogDistOptimalParams)
corr, std = calculateCorrelation(cogDist, taxDist)
print("CORRELATION: %f STD: %f" % (corr, std))
print("\nStoring COG distance dictionary...")
UtilStore(cogDist, COG_DIST_DICT())
sys.exit(0)

if (len(sys.argv) == 2) and (sys.argv[1] == "distCounts"):
    print("Building dict of taxonomy dist counts...")
    _, _, taxaDict, taxDist = \
        buildCogTaxaDict(noWeights = True)
    genTaxDistCntDict = DefDict(lambda: [0] * (TaxaType.maxDistance() + 1))
    for dir, tdd in taxDist.items():
        for d in tdd.values():
            genTaxDistCntDict[dir][d] += 1
    UtilStore(genTaxDistCntDict, GENOME_TAX_DIST_CNT_DICT())
    ttTaxDistCntDict = {}
    for dir, l in genTaxDistCntDict.items():
        ttTaxDistCntDict[taxaDict[dir].type.key] = l
    UtilStore(ttTaxDistCntDict, TAXTYPE_TAX_DIST_CNT_DICT())
    sys.exit(0)

print("WRONG COMMAND LINE")
def buildCogTaxaDict(noWeights = False, showCogFreqHist = False,
        interpolationRange = None):
    print("reading taxa dictionary...")
    taxaDict = UtilLoad(PROK_TAXA_DICT())
    print("Read %d organisms" % len(taxaDict))

    print("Reading cogDict...")
    cogDict = UtilLoad(COG_DICT())

    print("Building COG frequencies...")
    cogFreq = DefDict(int)
    for dir, cogs in cogDict.iteritems():
        for cname in cogs:
            cogFreq[cname] += 1
    if showCogFreqHist:
        print("Showing cogFreq histogram...")
        UtilDrawHistogram(cogFreq.values(), show = True)

    temp = taxaDict.keys()
    for dir in temp:
        if dir not in cogDict:
            del taxaDict[dir]
    temp = cogDict.keys()
    for dir in temp:
        if dir not in taxaDict:
            del cogDict[dir]
    print("Valid set contains %d organisms" % len(cogDict))

    print("\nBuilding Taxonomy distances...")
    taxDist = DefDict(dict)
    for dir1, taxa1 in taxaDict.items():
        for dir2, taxa2 in taxaDict.items():
            d = taxa1.distance(taxa2)
            taxDist[dir1][dir2] = d

    # Optimization
    if noWeights:
        return (cogDict, None, taxaDict, taxDist)

    fname = COG_WEIGHTS_DICT_LIST()
    if os.path.isfile(fname):
        print("Loading cogWeightDictList...")
        cogWeightDictList = UtilLoad(fname, progrIndPeriod=100)
    else:
        print("Building cogWeightsDict...")
        cogWeightDictList = [DefDict(dict) for i \
            in range(0, COG_REG_STEP_COUNT+1)]
        if not interpolationRange:
            interpolationRange = range(0, COG_REG_STEP_COUNT+1)
        for i in interpolationRange:
            expCogReg = math.exp(COG_REG_LOWER + float(i) * COG_REG_STEP)
            print("\nexpCogReg %f" % expCogReg)
            cogWeightDict = cogWeightDictList[i]
            for ind, (dir1, cogs1) in enumerate(cogDict.iteritems(), start=1):
                print("\r%d.%d. %s" % (i, ind, dir1)),
                for dir2, cogs2 in cogDict.iteritems():
                    cogWeightDict[dir1][dir2] = \
                        cogSetWeight(cogs1 & cogs2, cogFreq, expCogReg)
            print
        UtilStore(cogWeightDictList, fname)

    return (cogDict, cogWeightDictList, taxaDict, taxDist)
train_fn = fn[train]
#n_class = len(np.unique(label[train]))
n_class = 30
'''
c = AC(n_clusters=n_class, affinity='cosine', linkage='average')
c.fit(train_fd)
tmp = dd(list)
for i,j in zip(c.labels_, train):
    tmp[i].append(j)
for k,v in tmp.items():
    for vv in v:
        print k, input3[vv]
'''
c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
c.fit(train_fn)
tmp = DD(list)
for i,j in zip(c.labels_,train):
    tmp[i].append(j)
for k,v in tmp.items():
    for vv in v:
        pass
        #print k, input1[vv]

ex = DD(list)
dist = np.sort(c.transform(train_fn))
for i,j,k in zip(c.labels_,train,dist):
    ex[i].append([j,k[0]])
    print i,k[0],input3[j]
for i,j in ex.items():
    ex[i] = sorted(j, key=lambda x: x[-1])
def word_search(key: str, hist: defaultdict):
    freq = 0
    for v, k in hist.items():
        if v == key:
            return k
    return freq
def average_frequency(hist: defaultdict):
    total = 0
    for freq in hist.values():
        total += int(freq)
    return total / len(hist.values())
cogDict, cogFreq, cogWeightDict, taxaDict = ccm.buildCogTaxaDict()
print ("cogDict len %d, taxaDict len %d" % (len(cogDict), len(taxaDict)))
dirCorrDict = UtilLoad(GENOME_CORR_DICT())
print ("dirCorrDict len %d" % len(dirCorrDict))

print("Building COG distances...")
cogDist = DefDict(dict)
for ordinal, (dir1, cs1) in enumerate(cogDict.iteritems(), start = 1):
    print("\r%d. %s" % (ordinal, dir1)),
    for dir2, cs2 in cogDict.iteritems():
        cogDist[dir1][dir2] = cogDistFunc(cs1, cs2)

print("\nBuilding average distances for TaxaTypes...")
# Genome dir -> dict of {taxaTypes -> avg COG distance to dir}
dirTaxaTypeDictDict = DefDict(lambda: DefDict(list))
for ordinal, dir1 in enumerate(taxaDict.keys(), start = 1):
    print("\r%d. %s" % (ordinal, dir1)),
    for dir2, taxa in taxaDict.iteritems():
        dirTaxaTypeDictDict[dir1][repr(taxa.type)].append(cogDist[dir1][dir2])

print("\nRebuilding dirTaxaTypeDictDict to get UtilNormDistribs...")
for dir, d in dirTaxaTypeDictDict.iteritems():
    # Find global weighted STD
    std = 0.
    totalLen = 0
    for taxaTypeStr, distList in d.iteritems():
        if len(distList) >= 2:
            val = np.std(distList, ddof = 1.)
            std += val * val * len(distList)
            totalLen += len(distList)
label = test_label
class_ = np.unique(train_label)
name = []
for i in input1:
    s = re.findall('(?i)[a-z]{2,}',i)
    name.append(' '.join(s))

cv = CV(analyzer='char_wb', ngram_range=(3,4))
test_fn = cv.fit_transform(name).toarray()
for b in bl:
    print b.score(test_fd,label)

n_class = 32/2
c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
c.fit(test_fn)
dist = np.sort(c.transform(test_fn))

ex = DD(list) #example id, distance to centroid
ex_id = DD(list) #example id for each C
ex_N = [] #number of examples for each C
for i,j,k in zip(c.labels_, xrange(len(test_fn)), dist):
    ex[i].append([j,k[0]])
    ex_id[i].append(int(j))
for i,j in ex.items():
    ex[i] = sorted(j, key=lambda x: x[-1])
    ex_N.append([i,len(ex[i])])
ex_N = sorted(ex_N, key=lambda x: x[-1],reverse=True) #sort cluster by density

nb_c = DD()
for exx in ex_id.values():
    exx = np.asarray(exx)
    for e in exx:
        nb_c[e] = exx[exx!=e]

nb_f = [DD(), DD(), DD()]
# Build a tree of TaxaTypes
taxaTypeTree = TaxaTypeTree(taxaDict)
# Set of all Taxa types on all levels
allTaxaTypes = taxaTypeTree.getAllTypesSet()
print("Length of allTaxaTypes %d" % len(allTaxaTypes))

# Build a dictionary: [dir][taxaType] -> UtilObject(mean, std,
# isAncest, distList), where
# mean - mean distance between this dir and all [other] dirs in this taxaType
# std - standard deviation, if applicable
# distList - list of all the distances
print("Building taxaTypeDistDict...")
taxaTypeDistDict = DefDict(dict)
taxaTypeAncDistDict = DefDict(lambda: [None] * (TaxaType.hierarchySize()+1))
for ind, dir in enumerate(taxaDict, start = 1):
    print("\r%u. %s" % (ind, dir)),
    d = taxaTypeDistDict[dir]
    listAnc = taxaTypeAncDistDict[dir]
    cogDistForDir = cogDist[dir]
    for currType in allTaxaTypes:
        dirs = taxaTypeTree.getDirSet(currType)
        if dir in dirs:
            ancestorList = True
            dirs.remove(dir)
        else:
            ancestorList = False
        distList = [cogDistForDir[x] for x in dirs]
        meanVal = 0.0
        stdVal = 0.0
c = KMeans(init='k-means++', n_clusters=n_class, n_init=10)
c.fit(c_fn)
'''
c = DPGMM(n_components=50, covariance_type='diag', alpha=1)
c.fit(train_fd)
c_labels = c.predict(train_fd)
print '# of GMM', len(np.unique(c_labels))
mu = c.means_
cov = c._get_covars()
c_inv = []
for co in cov:
    c_inv.append(np.linalg.inv(co))
e_pr = np.sort(c.predict_proba(train_fd))
'''
dist = np.sort(c.transform(c_fn))

ex = DD(list) #example id, distance to centroid
ex_id = DD(list) #example id for each C
ex_N = [] #number of examples for each C
#for i,j,k in zip(c_labels, train, e_pr):
for i,j,k in zip(c.labels_, train, dist):
    ex[i].append([j,k[0]])
    ex_id[i].append(int(j))
for i,j in ex.items():
    ex[i] = sorted(j, key=lambda x: x[-1])
    ex_N.append([i,len(ex[i])])
ex_N = sorted(ex_N, key=lambda x: x[-1],reverse=True) #sort cluster by density

#confidence of training ex
label_pr = np.sort(md.predict_proba(data2[train]))
cf_d = DD()
for i,pr in zip(train, label_pr):