def spacy_stats(caption):
    doc = nlp(caption)
    tokens = [token for token in doc]
    POS = ["POS_" + token.pos_ for token in tokens]
    tags = ["TAG_" + token.tag_ for token in tokens]
    ents = ["ENT_" + ent.label_ for ent in doc.ents]
    is_blank = {
        k: sum(getattr(token, k) for token in tokens)
        for k in [
            "is_digit",
            "is_lower",
            "is_upper",
            "is_title",
            "is_punct",
            "is_currency",
            "like_num",
            "is_oov",
            "is_stop",
        ]
    }
    return {
        "num_stop": sum(t.is_stop for t in tokens),
        "num_alpha": sum(t.is_alpha for t in tokens),
        "num_tokens": len(tokens),
        "num_noun_chunks": len(list(doc.noun_chunks)),
        "num_words": len(doc),
        **toolz.frequencies(POS),
        **toolz.frequencies(tags),
        **toolz.frequencies(ents),
        **is_blank,
    }
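# Usage sketch for spacy_stats (assumptions: spacy and toolz are installed,
# and the module defines `nlp` as a loaded pipeline; the model name below is
# a hypothetical choice).
import spacy
import toolz

nlp = spacy.load("en_core_web_sm")
stats = spacy_stats("Two dogs play on the beach.")
print(stats["num_tokens"], stats.get("POS_NOUN", 0))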
def get_citation_histograms(identifiers, data=None):
    ch = {}
    current_year = datetime.now().year
    # Get necessary data if nothing was provided
    if not data:
        data = get_citations(identifiers)
        if len(data) == 0:
            data = get_citations(identifiers, no_zero=False)
    years = [int(p.bibcode[:4]) for p in data]
    # First gather all necessary data
    # refereed -> refereed
    rr_data = [([int(c[:4]) for c in p.refereed_citations],
                1.0 / float(p.author_num)) for p in data if p.refereed]
    # refereed -> non-refereed
    rn_data = [([int(c[:4]) for c in p.citations
                 if c in p.refereed_citations],
                1.0 / float(p.author_num)) for p in data if not p.refereed]
    # non-refereed -> refereed
    nr_data = [([int(c[:4]) for c in
                 set(p.citations).difference(p.refereed_citations)],
                1.0 / float(p.author_num)) for p in data if p.refereed]
    # non-refereed -> non-refereed
    nn_data = [([int(c[:4]) for c in p.citations
                 if c not in p.refereed_citations],
                1.0 / float(p.author_num)) for p in data if not p.refereed]
    # First construct the regular histograms
    rr_hist = cy.frequencies(itertools.chain(*[d[0] for d in rr_data]))
    rn_hist = cy.frequencies(itertools.chain(*[d[0] for d in rn_data]))
    nr_hist = cy.frequencies(itertools.chain(*[d[0] for d in nr_data]))
    nn_hist = cy.frequencies(itertools.chain(*[d[0] for d in nn_data]))
    # Get the earliest citation; dict views cannot be concatenated with "+"
    # in Python 3, so collect the keys into lists first
    try:
        min_year = min(list(rr_hist) + list(rn_hist) +
                       list(nr_hist) + list(nn_hist))
        nullhist = [(y, 0) for y in range(min_year, current_year + 1)]
    except ValueError:
        # No citations at all: fall back to the publication years
        nullhist = [(y, 0) for y in range(min(years), current_year + 1)]
    # Now create the histograms with zeroes for years without values
    ch['refereed to refereed'] = merge_dictionaries(dict(nullhist), rr_hist)
    ch['refereed to nonrefereed'] = merge_dictionaries(dict(nullhist),
                                                       rn_hist)
    ch['nonrefereed to refereed'] = merge_dictionaries(dict(nullhist),
                                                       nr_hist)
    ch['nonrefereed to nonrefereed'] = merge_dictionaries(dict(nullhist),
                                                          nn_hist)
    min_year = min(list(ch['refereed to refereed']) +
                   list(ch['refereed to nonrefereed']) +
                   list(ch['nonrefereed to refereed']) +
                   list(ch['nonrefereed to nonrefereed']))
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]
    # Normalized histograms need a different approach
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in rr_data]))
    ch['refereed to refereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in rn_data]))
    ch['refereed to nonrefereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in nr_data]))
    ch['nonrefereed to refereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in nn_data]))
    ch['nonrefereed to nonrefereed normalized'] = get_norm_histo(
        nullhist + tmp)
    return ch
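# The helpers merge_dictionaries and get_norm_histo used above are not shown
# in this section; a minimal sketch of plausible definitions (assumptions,
# not the originals): overlay observed counts onto the zero-filled
# histogram, and accumulate fractional (1/author_num) weights per year.
import cytoolz as cy

def merge_dictionaries(nullhist, hist):
    # Later dicts win on key collisions, so observed years override zeroes.
    return cy.merge(nullhist, hist)

def get_norm_histo(pairs):
    # pairs: iterable of (year, weight); sum the weights per year.
    histo = {}
    for year, weight in pairs:
        histo[year] = histo.get(year, 0.0) + weight
    return histo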
def prep_work(self, test_files, develop_files, train_files, threshold):

    #Reduce DID to language-specific samples
    if self.type == "DID":
        test_files = [x for x in test_files if x[0] == self.language]
        develop_files = [x for x in develop_files if x[0] == self.language]
        train_files = [x for x in train_files if x[0] == self.language]

        #Filter by number of samples
        country_list = [x[-1] for x in train_files]
        starting = len(set(country_list))
        country_dict = ct.frequencies(country_list)
        country_threshold = lambda x: x >= threshold
        country_dict = ct.valfilter(country_threshold, country_dict)
        country_list = list(country_dict.keys())
        print("\t\tReducing initial set of " + str(starting) +
              " countries to " + str(len(country_list)) +
              " after frequency threshold.")

        #Prune and shuffle file lists
        test_files = [x for x in test_files if x[-1] in country_list]
        shuffle(test_files)
        train_files = [x for x in train_files if x[-1] in country_list]
        shuffle(train_files)
        develop_files = [x for x in develop_files if x[-1] in country_list]
        shuffle(develop_files)

        return test_files, develop_files, train_files, country_list

    elif self.type == "LID":

        #Filter by number of samples
        lang_list = [x[0] for x in train_files]
        starting = len(set(lang_list))
        lang_dict = ct.frequencies(lang_list)
        lang_threshold = lambda x: x >= threshold
        lang_dict = ct.valfilter(lang_threshold, lang_dict)
        lang_list = list(lang_dict.keys())
        print("\t\tReducing initial set of " + str(starting) +
              " languages to " + str(len(lang_list)) +
              " after frequency threshold.")

        #Prune and shuffle file lists
        test_files = [x for x in test_files if x[0] in lang_list]
        shuffle(test_files)
        train_files = [x for x in train_files if x[0] in lang_list]
        shuffle(train_files)
        develop_files = [x for x in develop_files if x[0] in lang_list]
        shuffle(develop_files)

        return test_files, develop_files, train_files, lang_list
def data_description(self, y_dev):
    freqs = ct.frequencies(y_dev)
    for i in range(len(self.y_encoder.classes_)):
        print("\t", end="")
        # Use .get so classes absent from y_dev print 0 instead of raising
        print(self.y_encoder.classes_[i], freqs.get(i, 0))
def process_file(self, filename, delta_threshold=0.05, freq_threshold=1,
                 save=True):

    candidates = []
    starting = time.time()

    #Initialize Beam Search class
    BS = BeamSearch(delta_threshold, self.association_dict)

    for line in self.Encoder.load_stream(filename):
        if len(line) > 2:
            #Beam Search extraction
            candidates += BS.beam_search(line)

    #Count each candidate, get dictionary with candidate frequencies
    candidates = ct.frequencies(candidates)
    print("\t" + str(len(candidates)) + " candidates before pruning.")

    #Reduce nonce candidates
    above_threshold = lambda x: x > freq_threshold
    candidates = ct.valfilter(above_threshold, candidates)

    #Print time and number of remaining candidates
    print("\t" + str(len(candidates)) + " candidates in " +
          str(time.time() - starting) + " seconds.")

    if save:
        self.Loader.save_file(candidates, filename + ".candidates.p")
        return os.path.join(self.Loader.output_dir,
                            filename + ".candidates.p")
    else:
        return candidates
def calc_stats(dp: dataset_pb2.DataPoint) -> pd.DataFrame:
    items = []
    signal = np.array(dp.signal)
    items.append(("Signal length", len(signal)))
    items.append(("Signal min value", np.min(signal)))
    items.append(("Signal median value", np.median(signal)))
    items.append(("Signal max value", np.max(signal)))
    items.append(("Signal value std", np.std(signal)))
    items.append(("Basecalled length", len(dp.basecalled)))
    items.append(("Reference length", len(dp.aligned_ref)))

    occ = toolz.frequencies(dp.cigar)
    items.append((
        "Match Rate",
        occ.get(dataset_pb2.MATCH, 0) / len(dp.aligned_ref),
    ))
    items.append((
        "Mismatch Rate",
        occ.get(dataset_pb2.MISMATCH, 0) / len(dp.aligned_ref),
    ))
    items.append((
        "Insertion Rate",
        occ.get(dataset_pb2.INSERTION, 0) / len(dp.aligned_ref),
    ))
    items.append((
        "Deletion Rate",
        occ.get(dataset_pb2.DELETION, 0) / len(dp.aligned_ref),
    ))
    items.append(("Signal sample/bases", len(signal) / len(dp.basecalled)))
    return pd.DataFrame(items, columns=("Attribute", "Value"))
def print_labels(df, labels):
    """
    Return an inventory of label counts as a dictionary.

    :param df: pandas DataFrame holding the labels
    :param labels: name of the label column to count
    :return: dict mapping each label value to its frequency
    """
    return ct.frequencies(df.loc[:, labels])
def build_vocab(tokenized_texts, min_occur_count):
    word_counts = cytoolz.frequencies(
        w for doc in tokenized_texts for w in doc.lower().split())
    word_counts = cytoolz.valfilter(
        lambda v: v >= min_occur_count, word_counts)
    vocab, counts = zip(*sorted(
        word_counts.items(), key=operator.itemgetter(1), reverse=True))
    vocab = list(vocab)
    counts = np.array(counts)
    return vocab, counts
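# Usage sketch for build_vocab (assumes the module-level imports the
# function needs: cytoolz, numpy as np, and operator).
import operator
import cytoolz
import numpy as np

docs = ["the cat sat", "the dog sat", "the cat ran"]
vocab, counts = build_vocab(docs, min_occur_count=2)
print(vocab, counts)  # expected: ['the', 'cat', 'sat'] [3 2 2]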
def get_publication_histograms(identifiers):
    ph = {}
    current_year = datetime.now().year
    # Get necessary data
    data = get_publication_data(identifiers)
    # Get the publication histogram
    years = [int(p.bibcode[:4]) for p in data]
    nullhist = [(y, 0) for y in range(min(years), current_year + 1)]
    yearhist = cy.frequencies(years)
    ph['all publications'] = merge_dictionaries(dict(nullhist), yearhist)
    years_ref = [int(p.bibcode[:4]) for p in data if p.refereed]
    yearhist = cy.frequencies(years_ref)
    ph['refereed publications'] = merge_dictionaries(dict(nullhist),
                                                     yearhist)
    # Get the normalized publication histogram
    tmp = [(int(p.bibcode[:4]), 1.0 / float(p.author_num)) for p in data]
    ph['all publications normalized'] = get_norm_histo(nullhist + tmp)
    tmp = [(int(p.bibcode[:4]), 1.0 / float(p.author_num))
           for p in data if p.refereed]
    ph['refereed publications normalized'] = get_norm_histo(nullhist + tmp)
    return ph
def simulate_counts(p, C, phys_dim=2, seed=None):
    """Simulate measuring each qubit of ``p`` in the computational basis,
    producing output like that of ``qiskit``.

    Parameters
    ----------
    p : vector or operator
        The quantum state, assumed to be normalized, as either a ket or
        density operator.
    C : int
        The number of counts to perform.
    phys_dim : int, optional
        The assumed size of the subsystems of ``p``, defaults to 2 for
        qubits.
    seed : int, optional
        Seed the random number generator, for reproducible results.

    Returns
    -------
    results : dict[str, int]
        The counts for each bit string measured.

    Examples
    --------
    Simulate measuring the state of each qubit in a GHZ-state:

    .. code:: python3

        >>> import quimb as qu
        >>> psi = qu.ghz_state(3)
        >>> qu.simulate_counts(psi, 1024)
        {'000': 514, '111': 510}
    """
    if seed is not None:
        np.random.seed(seed)

    n = infer_size(p, phys_dim)
    d = phys_dim**n

    if isop(p):
        pi = np.diag(p).real
    else:
        pi = np.multiply(np.conj(p), p).real

    # probability of each basis state
    pi = pi.reshape(-1)

    # raw counts in terms of integers
    raw_counts = np.random.choice(np.arange(d), size=C, p=pi)

    # convert to frequencies of binary
    bin_str = '{:0>' + str(n) + 'b}'
    results = keymap(bin_str.format, frequencies(raw_counts))

    return results
def doc_features(doc):
    doc_words = cytoolz.frequencies(cm.filter_sw(doc))
    # initialize to 0
    features = zero_features.copy()
    word_matches = match(doc_words, word_features)
    for word in word_matches:
        features[word] = doc_words[word]
    return features
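# doc_features above depends on module-level names not shown here. A minimal
# sketch of plausible definitions (assumptions, not the original code):
# word_features is the selected feature vocabulary, zero_features maps each
# feature word to 0, and match finds which feature words occur in a document.
word_features = ["good", "bad", "great"]  # hypothetical feature vocabulary
zero_features = {w: 0 for w in word_features}

def match(doc_words, features):
    # Return the feature words present in this document's word counts.
    return set(doc_words) & set(features)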
def _mk_fork_configuration_params(fork_config):
    all_block_numbers = tuple(fork_config.values())
    if len(all_block_numbers) != len(set(all_block_numbers)):
        duplicates = tuple(sorted(
            blk_num
            for blk_num, freq in frequencies(all_block_numbers).items()
            if freq > 1))
        raise ValueError("Duplicate block numbers: {0}".format(duplicates))
    args = {(block_number, FORK_NAME_MAPPING[fork_name])
            for fork_name, block_number in fork_config.items()
            if (block_number is not None and fork_name != FORK_DAO)}
    if FORK_DAO in fork_config:
        kwargs = {'dao_start_block': fork_config[FORK_DAO]}
    else:
        kwargs = {}
    return args, kwargs
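# Usage sketch for _mk_fork_configuration_params. FORK_NAME_MAPPING and
# FORK_DAO are module-level constants in the original source, and the bare
# `frequencies` presumably comes from toolz or cytoolz; the values below are
# hypothetical stand-ins to make the duplicate check observable.
from toolz import frequencies

FORK_DAO = 'daoForkBlock'
FORK_NAME_MAPPING = {'homesteadBlock': 'Homestead', 'eip150Block': 'EIP150'}

try:
    _mk_fork_configuration_params({'homesteadBlock': 0, 'eip150Block': 0})
except ValueError as exc:
    print(exc)  # Duplicate block numbers: (0,)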
def ngram_counts(words, n, pad='<eos>'):
    """ generates a dictionary of ngram counts from a list of words. """
    return frequencies(ngrams(words, n, pad))
def freq_dict(file_words):
    filtered = cm.filter_sw(file_words[1].split())
    fd = cytoolz.frequencies(filtered)
    return fd
def build_ngram_model(sentences, n, pad='<eos>'):
    """ generates a dictionary of word-ngram counts from a list of
    sentences. """
    return frequencies(
        concat(ngrams(sent, n, pad) for sent in sentences)
    )
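# ngram_counts and build_ngram_model above rely on an `ngrams` helper that
# is not shown in this section; a minimal sketch of one plausible definition
# (an assumption, not the original), padding the end of the sequence, with
# `frequencies`/`concat` assumed to come from toolz or cytoolz:
from cytoolz import concat, frequencies, sliding_window

def ngrams(words, n, pad='<eos>'):
    # e.g. ngrams(['the', 'cat'], 2) -> [('the', 'cat'), ('cat', '<eos>')]
    return list(sliding_window(n, list(words) + [pad] * (n - 1)))

print(build_ngram_model([["the", "cat"], ["the", "dog"]], 2))
# -> {('the', 'cat'): 1, ('cat', '<eos>'): 1,
#     ('the', 'dog'): 1, ('dog', '<eos>'): 1}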
def select_word_features(corpus):
    words = cytoolz.frequencies(corpus)
    sorted_words = sorted(words, key=words.get)
    # Keep the most frequent 2% of distinct words; guard against N == 0,
    # where sorted_words[-0:] would return the whole list instead of none.
    N = max(1, int(.02 * len(sorted_words)))
    return sorted_words[-N:]
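# Usage sketch for select_word_features: the corpus is a flat iterable of
# tokens, and the most frequent words become the feature vocabulary.
import cytoolz

corpus = ["the"] * 5 + ["cat"] * 3 + ["dog"] * 2 + ["ran"]
print(select_word_features(corpus))  # -> ['the'] (top 2%, at least one word)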
def frequencies(self):
    return fdict(cytoolz.frequencies(self))
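# The frequencies method above presumably lives on a list-like container
# that wraps the counts in an `fdict`; a minimal sketch of that context
# (an assumption, not the original class definitions):
import cytoolz

class fdict(dict):
    """Hypothetical frequency-dict wrapper used by the original code."""

class flist(list):
    def frequencies(self):
        return fdict(cytoolz.frequencies(self))

print(flist("abracadabra").frequencies())
# -> {'a': 5, 'b': 2, 'r': 2, 'c': 1, 'd': 1}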
noun1, preposition, noun2 = binary.split('-')
l1 += list(df[df['object1'] == noun1].image)
l2 += list(df[df['object2'] == noun2].image)
l3 += list(df[df['preposition'] == preposition].image)
l4 += list(df[((df['object1'] == unary) | (df['object2'] == unary))
              & (df['rcc'] == 'DC')].image)
# l1 += list(df[(df['object1'] == noun1) & (df['rcc'].notnull())].image)
# l2 += list(df[(df['object2'] == noun2) & (df['rcc'].notnull())].image)
# l3 += list(df[(df['preposition'] == preposition) & (df['rcc'].notnull())].image)
# l4 += list(df[((df['object1'] == unary) | (df['object2'] == unary)) & (df['rcc'] == 'DC')].images)

retrieved = {
    k: v / weights[k]
    for k, v in cytoolz.frequencies(l1 + l2 + l3 + l4).items()
}
valids = [(k, retrieved[k])
          for k in sorted(retrieved, key=retrieved.get, reverse=True)
          if retrieved[k] >= 2.5]
retrieved = []
relevance = []
if valids:
    retrieved, relevance = zip(*valids)
gs = [
    imagenames[idx] for idx, is_valid in enumerate(query['rank'])
    if is_valid
]
def alignment_stats(lable_ind, label_val, pred_ind, pred_val, batch_size,
                    debug=False):
    """Returns a list of numpy arrays representing alignment stats.

    The first N elements follow aligment_stats_ordering and the last one is
    the identity. The return is shaped like this due to tf.py_func
    requirements --> this function is made for embedding as a tf operation
    via tf.py_func.

    :param lable_ind: sparse-tensor indices of the label (target) sequences
    :param label_val: sparse-tensor values of the label sequences
    :param pred_ind: sparse-tensor indices of the predicted sequences
    :param pred_val: sparse-tensor values of the predicted sequences
    :param batch_size: number of sequences in the batch
    :param debug: if True, log a detailed per-read alignment breakdown
    :return:
    """
    prefix = os.environ.get("MINCALL_LOG_DATA", None)
    if prefix:
        fname = os.path.abspath(
            os.path.join(prefix, f"{uuid.uuid4().hex}.npz"))
        with open(fname, "wb") as f:
            np.savez(
                f, **{
                    "label_val": label_val,
                    "lable_ind": lable_ind,
                    "pred_val": pred_val,
                    "pred_ind": pred_ind,
                    "batch_size": batch_size,
                })
        logger.debug(f"Saved alignment stats input data to {fname}")

    # Group sparse values by their batch index (ind[0])
    yt = defaultdict(list)
    for ind, val in zip(lable_ind, label_val):
        yt[ind[0]].append(val)

    yp = defaultdict(list)
    for ind, val in zip(pred_ind, pred_val):
        yp[ind[0]].append(val)

    sol = defaultdict(list)
    identities = []
    for x in range(batch_size):
        query = decode(np.array(yp[x], dtype=int))
        target = decode(np.array(yt[x], dtype=int))
        if len(target) == 0:
            raise ValueError("Empty target sequence")
        if len(query) == 0:
            logger.warning(f"Empty query sequence\n"
                           f"Target: {target}")
            sol[dataset_pb2.MATCH].append(0.0)
            sol[dataset_pb2.MISMATCH].append(0.0)
            sol[dataset_pb2.DELETION].append(1.0)
            sol[dataset_pb2.INSERTION].append(0.0)
            identities.append(0)
            continue
        edlib_res = edlib.align(query, target, task='path')
        stats = ext_cigar_stats(edlib_res['cigar'])

        read_len = (stats[dataset_pb2.MISMATCH] + stats[dataset_pb2.MATCH] +
                    stats[dataset_pb2.INSERTION])

        # https://github.com/isovic/samscripts/blob/master/src/errorrates.py
        identities.append(stats[dataset_pb2.MATCH] / sum(stats.values()))

        for op in aligment_stats_ordering:
            sol[op].append(stats[op] / read_len)

        # Verbose per-read logging, gated by the previously unused debug flag
        if debug:
            msg = "edlib results\n"
            s_query, s_target, _ = squggle(query, target)
            exp_cigar = expand_cigar(edlib_res['cigar'])

            for i in range(0, len(s_query), 80):
                msg += "query: " + s_query[i:i + 80] + "\n"
                msg += "target: " + s_target[i:i + 80] + "\n"
                msg += "cigar : " + exp_cigar[i:i + 80] + "\n"
                msg += "--------" + 80 * "-" + "\n"

            msg += "query: " + query + "\n"
            msg += "target: " + target + "\n"
            msg += "full cigar: " + edlib_res['cigar'] + "\n"
            msg += pformat(
                {dataset_pb2.Cigar.Name(k): v
                 for k, v in stats.items()}) + "\n"
            msg += "readl: " + str(read_len) + "\n"

            df = pd.DataFrame({
                "query": toolz.merge(
                    toolz.frequencies(query),
                    toolz.keymap(
                        "".join,
                        toolz.frequencies(toolz.sliding_window(2, query))),
                ),
                "target": toolz.merge(
                    toolz.frequencies(target),
                    toolz.keymap(
                        "".join,
                        toolz.frequencies(toolz.sliding_window(2, target))),
                ),
            })
            df["delta"] = 100 * (df['target'] / df['query'] - 1)
            df = df[['query', 'target', 'delta']]
            msg += "Stats\n" + str(df) + "\n"
            msg += "==================\n"
            logger.info(msg)

    sol = [np.array(sol[op], dtype=np.float32)
           for op in aligment_stats_ordering]
    sol_data = {
        dataset_pb2.Cigar.Name(k): v
        for k, v in zip(aligment_stats_ordering, sol)
    }
    sol_data["IDENTITY"] = identities
    logger.info(f"sol: \n{pd.DataFrame(sol_data)}")
    return sol + [np.array(identities, dtype=np.float32)]