def _set_parent_errors(self, trace):
    """
    Set parent_error flag for all transactions in branches
    finished with error in trace

    The trace is walked twice. The first pass records, per transaction
    hash, the trace address of every entry that carries an "error" key.
    The second pass sets ``parent_error = True`` (in place) on each entry
    for which ``shortest_prefix`` of its trace address is truthy in the
    recorded addresses while the address itself is not a recorded key.

    Parameters
    ----------
    trace : list
        List of transactions
    """
    failed_by_hash = {}
    for entry in trace:
        if "error" not in entry:
            continue
        tx_hash = entry["transactionHash"]
        if tx_hash not in failed_by_hash:
            failed_by_hash[tx_hash] = trie.Trie()
        failed_by_hash[tx_hash][entry["traceAddress"]] = True

    for entry in trace:
        tx_hash = entry["transactionHash"]
        if tx_hash not in failed_by_hash:
            continue
        failed_addresses = failed_by_hash[tx_hash]
        address = entry["traceAddress"]
        has_failed_prefix = bool(failed_addresses.shortest_prefix(address))
        is_failed_itself = failed_addresses.has_key(address)
        if has_failed_prefix and not is_failed_itself:
            entry["parent_error"] = True
def __init__(self, args):
    """
    Store the run configuration and immediately start counting words.

    :param args: object exposing ``paths`` and ``max_num``; ``max_num`` is
        converted with ``int``
    """
    self.paths = args.paths
    self.max_num = int(args.max_num)
    self.final_trie = tri.Trie()
    self.trie_per_file = []
    # Raw string: "\[" and "\]" are regex escapes, not Python string escapes
    # (a non-raw literal triggers an invalid-escape warning on new Pythons).
    self.__char_to_remove = re.compile(r'[,.!?*#();:\[\]{}]')
    self.count_words()
def build_trie(self):
    """
    Build a trie from the pieces and store it on ``self.Trie``.

    Each ``(piece, score)`` pair yielded by
    ``self.SentencePiece.get_pieces().items()`` is stored under the piece
    as the tuple ``(piece, score)``.
    """
    piece_trie = pygtrie.Trie()
    pieces = self.SentencePiece.get_pieces()
    for piece, score in pieces.items():
        piece_trie[piece] = (piece, score)
    self.Trie = piece_trie
def __init__(self, language: str = "ru", mode: Mode = Mode.GRAPHEMES, raw_dict_path=None, trie_path=None,
             zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT) -> None:
    """
    Prepare the stress dictionary, loading a saved trie or building a new one.

    Only two combinations are supported: ``language == "ru"`` with
    ``Mode.GRAPHEMES`` and ``language == "en"`` with ``Mode.PHONEMES``.
    For each, ``__init_defaults`` is called with the matching path constants
    and the raw dictionary file is generated from the source dictionary when
    it does not exist yet.

    :param language: language code, "ru" or "en"
    :param mode: dictionary mode (graphemes or phonemes)
    :param raw_dict_path: path of the raw dictionary file
    :param trie_path: path of the serialized trie
    :param zalyzniak_dict: source passed to ``ZalyzniakDict.convert_to_accent_only``
    :param cmu_dict: source passed to ``CMUDict.convert_to_phoneme_stress``
    :raises AssertionError: if the language/mode combination is unsupported
    :raises FileNotFoundError: if the raw dictionary file is missing
    """
    self.data = pygtrie.Trie()  # type: Dict[str, Set[Stress]]
    self.raw_dict_path = raw_dict_path
    self.trie_path = trie_path
    if language == "ru" and mode == self.Mode.GRAPHEMES:
        self.__init_defaults(RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH)
        if not os.path.exists(self.raw_dict_path):
            from rupo.dict.zaliznyak import ZalyzniakDict
            ZalyzniakDict.convert_to_accent_only(zalyzniak_dict, self.raw_dict_path)
    elif mode == self.Mode.PHONEMES and language == "en":
        self.__init_defaults(EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH)
        if not os.path.exists(self.raw_dict_path):
            CMUDict.convert_to_phoneme_stress(cmu_dict, self.raw_dict_path)
    else:
        # Raised explicitly instead of `assert False`: a bare assert is
        # stripped under `python -O`, which would let an unsupported
        # combination fall through to the file handling below.
        raise AssertionError(
            "Unsupported language/mode combination: {!r}, {!r}".format(language, mode))
    if not os.path.isfile(self.raw_dict_path):
        raise FileNotFoundError("Dictionary raw file not found.")
    if os.path.isfile(self.trie_path):
        self.load(self.trie_path)
    else:
        self.create(self.raw_dict_path, self.trie_path)
def _read_file(self, path):
    """
    Count the words of the lines that fall inside the configured time frame.

    A line in which ``utils.find_legal_date`` finds a date updates the
    "in time frame" state through ``self._is_timestamps_in_time_frame``;
    lines without a usable date keep the state of the previous line.

    :param path: path of a file
    :return: tuple ``(path, trie)`` where ``trie`` maps every counted word
        to its number of occurrences; ``path`` is returned unchanged even
        when no line was counted
    """
    trie = tri.Trie()
    with open(path, 'r', buffering=DEFAULT_BUFFER_SIZE) as f:
        in_time_frame = False
        for line in f:
            dates_string = utils.find_legal_date(line)
            if dates_string:
                try:
                    date = list(datefinder.find_dates(dates_string))[0]
                except (IndexError, OverflowError):
                    # No parsable date in the candidate string.
                    date = None
                if date is not None:
                    in_time_frame = self._is_timestamps_in_time_frame(
                        date, in_time_frame)
            if in_time_frame:
                for word in line.split():
                    word = self.__char_to_remove.sub('', word.lower())
                    if word:
                        if word in trie:
                            trie[word] += 1
                        else:
                            trie[word] = 1
    return path, trie
def buildSearchTrie(self, choices):
    """
    Index the choices by token.

    Every token produced by ``self.tokenizeChoice(choice)`` maps to the list
    of choices it was produced from, in iteration order. A choice appears
    once per occurrence of the token.

    :param choices: iterable of choices to index
    :return: the populated trie
    """
    index = trie.Trie()
    for choice in choices:
        tokens = self.tokenizeChoice(choice)
        for token in tokens:
            if not index.has_key(token):
                index[token] = []
            index[token].append(choice)
    return index
def get_phoenetic_trie():
    """
    Invert the module-level ``cmu`` mapping into a trie.

    Each value sequence found under a ``cmu`` key is joined with commas and
    used to group the keys that share it; the trie is then keyed by that
    sequence (split back into a list) and holds the list of grouped keys.

    :return: tuple ``(trie, cmu)``
    """
    keys_by_sequence = defaultdict(list)
    for word, variants in cmu.items():
        for variant in variants:
            keys_by_sequence[','.join(variant)].append(word)

    sequence_trie = trie.Trie()
    for joined, words in keys_by_sequence.items():
        sequence_trie[joined.split(',')] = words
    return sequence_trie, cmu
def create_trie():
    """
    Build a trie with 100 long byte-string keys.

    Key ``x`` is the serialized form of ``array.array('h', range(x, 1000))``
    and maps to ``x``.

    :return: the populated ``pygtrie.Trie``
    """
    to_bytes = getattr(array.array, 'tobytes', None)  # Python 3
    if not to_bytes:
        to_bytes = getattr(array.array, 'tostring')  # Python 2
    result = pygtrie.Trie()
    for start in range(100):
        key = to_bytes(array.array('h', range(start, 1000)))
        result.update([(key, start)])
    return result
def __attrs_post_init__(self):
    """
    Create the words queue and, if no word tree was supplied, load one.

    When ``self.words_tree`` is falsy, ``words.txt`` next to this module is
    read; lines whose stripped length is 3 to 9 characters are upper-cased
    (after right-stripping), kept in ``self._words`` and inserted into a new
    ``pygtrie.Trie`` with the value True.
    """
    self.words_queue = PriorityQueue(maxsize=0)
    if self.words_tree:
        return

    words_file = Path(__file__).parent / "words.txt"
    lines = words_file.read_text().splitlines()
    self._words = [
        line.rstrip().upper()
        for line in lines
        if 2 < len(line.strip()) <= 9
    ]
    self.words_tree = pygtrie.Trie()
    for word in self._words:
        self.words_tree[word] = True
def __init__(self, args):
    """
    Store the run configuration and immediately start counting words.

    :param args: object exposing ``time_frames`` (passed through
        ``utils.merge_intervals``), ``max_num`` (converted with ``int``)
        and ``debug``
    """
    self._time_frames = utils.merge_intervals(args.time_frames)
    self._max_num = int(args.max_num)
    self.final_trie = tri.Trie()
    self.__trie_per_file = []
    # Raw string: "\[" and "\]" are regex escapes, not Python string escapes
    # (a non-raw literal triggers an invalid-escape warning on new Pythons).
    self.__char_to_remove = re.compile(r'[=,.!?*#();:\[\]{}]')
    self._debug_mode = args.debug
    self._counted_word_files = set()
    self.count_words()
def build_trie(self, seq_iter):
    """
    Build a trie of all windows of up to ``self.max_prefix_len`` items.

    For every start position of every sequence, the slice of at most
    ``self.max_prefix_len`` items beginning there is stored (as a tuple)
    with the value 1. The trie is kept on ``self.trie`` and also returned.

    :param seq_iter: iterable of sliceable sequences
    :return: the populated ``pygtrie.Trie``
    """
    window = self.max_prefix_len
    window_trie = pygtrie.Trie()
    for seq in seq_iter:
        for offset in xrange(len(seq)):
            window_trie[tuple(seq[offset:offset + window])] = 1
    self.trie = window_trie
    return window_trie
def __init__(self, steps_per_second, num_velocity_bins, min_pitch, max_pitch,
             add_eos=False, ngrams=None):
    """Initialize a MidiPerformanceEncoder object.

    MIDI is encoded with a performance event encoding. Index 0 is reserved
    for padding and therefore unused. Index 1 is only used when `add_eos` is
    True, in which case it is appended to every encoded performance.

    When `ngrams` is given, the vocabulary is extended with those n-grams
    over the original performance event vocabulary: encoding replaces each
    n-gram with a new event index and decoding expands the new indices back
    into the original n-grams.

    Tensor2Tensor defines no actual encoder interface, but this class offers
    the same functions as TextEncoder, ImageEncoder, and AudioEncoder.

    Args:
      steps_per_second: Number of steps per second at which to quantize.
          Also determines the number of time shift events (up to one
          second).
      num_velocity_bins: Number of quantized velocity bins to use.
      min_pitch: Minimum MIDI pitch to encode.
      max_pitch: Maximum MIDI pitch to encode (inclusive).
      add_eos: Whether or not to add an EOS event to the end of encoded
          performances.
      ngrams: Optional list of performance event n-grams (tuples) to be
          represented by new indices. N-grams must have length at least 2
          and should be pre-offset by the number of reserved IDs.

    Raises:
      ValueError: If any n-gram has length less than 2, or contains one of
          the reserved IDs.
    """
    self._steps_per_second = steps_per_second
    self._num_velocity_bins = num_velocity_bins
    self._add_eos = add_eos
    self._ngrams = ngrams or []

    # Validate n-grams one by one: length first, then each event id.
    for ngram in self._ngrams:
        if len(ngram) < 2:
            raise ValueError('All n-grams must have length at least 2.')
        for event_id in ngram:
            if event_id < self.num_reserved_ids:
                raise ValueError('N-grams cannot contain reserved IDs.')

    self._encoding = magenta.music.PerformanceOneHotEncoding(
        num_velocity_bins=num_velocity_bins,
        max_shift_steps=steps_per_second,
        min_pitch=min_pitch,
        max_pitch=max_pitch)

    # Create a trie mapping n-grams to new indices; the new indices start
    # directly after the unigram vocabulary.
    first_ngram_id = self.unigram_vocab_size
    ngram_ids = range(first_ngram_id, first_ngram_id + len(self._ngrams))
    self._ngrams_trie = pygtrie.Trie(zip(self._ngrams, ngram_ids))

    # Also add all unigrams to the trie, each mapping to itself.
    self._ngrams_trie.update(
        ((event_id,), event_id)
        for event_id in range(self.unigram_vocab_size))
def _read_word_list(word_list_filepath: Path = DEFAULT_WORD_LIST_PATH) -> pygtrie.Trie:
    """
    Load the list of valid words into a trie.

    Every line of the file is stripped of surrounding whitespace and stored
    as a key with the value True.

    :param Path word_list_filepath: Path to a file containing the list of valid words
    :return: Trie containing all valid words
    :rtype: pygtrie.Trie
    """
    valid_words = pygtrie.Trie()
    # Word list file must contain one word per line
    with open(word_list_filepath, mode="r") as source:
        valid_words.update((line.strip(), True) for line in source)
    return valid_words
def _read_file(self, path):
    """
    Count how often each word occurs in a file.

    Lines are split on whitespace; each token is lower-cased and stripped
    of the characters matched by the instance's removal pattern. Tokens
    that become empty (pure punctuation) are skipped rather than counted
    under the empty key.

    :param path: path of a file
    :return: Trie object with all counted word
    """
    trie = tri.Trie()
    with open(path, 'r', buffering=DEFAULT_BUFFER_SIZE) as f:
        for line in f:
            for word in line.split():
                word = self.__char_to_remove.sub('', word.lower())
                if not word:
                    # Token consisted solely of removed characters.
                    continue
                if word in trie:
                    trie[word] += 1
                else:
                    trie[word] = 1
    return trie
def identical_subarray_trie(arr1, arr2):
    """
    Load the shorter of two arrays into a trie, printing along the way.

    Each item of the shorter array is printed and stored in a
    ``pygtrie.Trie`` under itself as key. The longer array is bound to
    ``large`` but never read, and nothing is returned.

    NOTE(review): written with Python 2 ``print`` statements; the final
    lookup uses the hard-coded prefix ``[1, 2, 3]``, so this looks like
    unfinished exploratory code -- confirm intent before relying on it.
    """
    t = pygtrie.Trie()
    # Placeholder initialisation; both names are rebound just below.
    large = small = []
    if len(arr1) <= len(arr2):
        small, large = arr1, arr2
    else:
        small, large = arr2, arr1
    for item in small:
        print item
        t[item] = item
    print
    # Emit a blank line when the trie holds values under prefix [1, 2, 3].
    if t.values([1, 2, 3]):
        print
def test_large_trie(self):
    """Test handling of large tries which would overflow stack."""
    to_bytes = getattr(array.array, 'tobytes', None)  # Python 3
    if not to_bytes:
        to_bytes = getattr(array.array, 'tostring')  # Python 2

    big = pygtrie.Trie()
    for start in range(100):
        big[to_bytes(array.array('h', range(start, 1000)))] = start

    # Plain iteration
    visited = sum(1 for _ in big.iteritems())
    self.assertEqual(100, visited)

    # Copy
    self.assertEqual(big, copy.copy(big))
    self.assertEqual(big, copy.deepcopy(big))
def __init__(self, language: str="ru", mode: Mode=Mode.GRAPHEMES, raw_dict_path=None, trie_path=None,
             zalyzniak_dict=ZALYZNYAK_DICT, cmu_dict=CMU_DICT) -> None:
    """
    Prepare the stress dictionary, loading a saved trie or building a new one.

    Only two combinations are supported: ``language == "ru"`` with
    ``Mode.GRAPHEMES`` and ``language == "en"`` with ``Mode.PHONEMES``.
    For each, ``__init_defaults`` is called with the matching path constants
    and the raw dictionary file is generated from the source dictionary when
    it does not exist yet.

    :param language: language code, "ru" or "en"
    :param mode: dictionary mode (graphemes or phonemes)
    :param raw_dict_path: path of the raw dictionary file
    :param trie_path: path of the serialized trie
    :param zalyzniak_dict: source passed to ``ZalyzniakDict.convert_to_accent_only``
    :param cmu_dict: source passed to ``CMUDict.convert_to_phoneme_stress``
    :raises AssertionError: if the language/mode combination is unsupported
    :raises FileNotFoundError: if the raw dictionary file is missing
    """
    self.data = pygtrie.Trie()
    self.raw_dict_path = raw_dict_path
    self.trie_path = trie_path
    if language == "ru" and mode == self.Mode.GRAPHEMES:
        self.__init_defaults(RU_GRAPHEME_STRESS_PATH, RU_GRAPHEME_STRESS_TRIE_PATH)
        if not os.path.exists(self.raw_dict_path):
            ZalyzniakDict.convert_to_accent_only(zalyzniak_dict, self.raw_dict_path)
    elif mode == self.Mode.PHONEMES and language == "en":
        self.__init_defaults(EN_PHONEME_STRESS_PATH, EN_PHONEME_STRESS_TRIE_PATH)
        if not os.path.exists(self.raw_dict_path):
            CMUDict.convert_to_phoneme_stress(cmu_dict, self.raw_dict_path)
    else:
        # Raised explicitly instead of `assert False`: a bare assert is
        # stripped under `python -O`, which would let an unsupported
        # combination fall through to the file handling below.
        raise AssertionError(
            "Unsupported language/mode combination: {!r}, {!r}".format(language, mode))
    if not os.path.isfile(self.raw_dict_path):
        raise FileNotFoundError("Не найден файл словаря.")
    if os.path.isfile(self.trie_path):
        self.load(self.trie_path)
    else:
        self.create(self.raw_dict_path, self.trie_path)
def populate_dictionary_trie():
    """
    Return the module-level dictionary trie, building it on first use.

    The trie is filled from ``dictionary.txt`` in the directory returned by
    ``get_script_path()``. Each line is upper-cased and right-stripped, and
    is stored with the value True when its length lies between
    ``MIN_WORD_LENGTH_DICTIONARY`` and ``MAX_WORD_LENGTH_DICTIONARY``
    (both inclusive). Later calls return the cached trie.

    :return: the populated trie held in ``DICTIONARY_TRIE``
    """
    global is_dictionary_trie_populated
    global DICTIONARY_TRIE
    if is_dictionary_trie_populated and DICTIONARY_TRIE is not None:
        # To avoid re-populating the Trie if it has already been created
        # and populated in memory
        return DICTIONARY_TRIE

    # Trie datastructure for storing dictionary words and fast retrieval,
    # and prefix matching
    DICTIONARY_TRIE = trie.Trie()
    # https://stackoverflow.com/a/6475407/3766839
    dictionary_path = os.path.join(get_script_path(), "dictionary.txt")
    with open(dictionary_path, "r") as dictionary_file:
        for word in dictionary_file:
            # stripping trailing newline characters of words from file,
            # and making uppercase
            word = str(word.upper()).rstrip()
            # Not including 2 In-frequent Letter words since they are pretty
            # random (LA, FR, etc) and give bad outputs
            if MIN_WORD_LENGTH_DICTIONARY <= len(word) <= MAX_WORD_LENGTH_DICTIONARY:
                DICTIONARY_TRIE[word] = True
    is_dictionary_trie_populated = True
    return DICTIONARY_TRIE
# Walk args.dir and, for every file whose name matches `pat`, remember its
# path under a running integer id and push its full content into `fm`.
file_list = {}
file_id = 0
for d, subdirs, files in os.walk(args.dir):
    for f in files:
        if pat.search(f):
            print('processing {} ...'.format(os.path.join(d, f)))
            file_list[file_id] = os.path.join(d, f)
            file_id = file_id + 1
            with open(os.path.join(d, f), 'r') as content_file:
                content = content_file.read()
                fm.push_back(content)
# Finalise `fm` once all contents have been pushed.
fm.build()

seen_clones = set()
# `clones` is keyed by whole sequences, `clone_fragments` by their proper
# suffixes (see add_clone below).
clone_fragments = pygtrie.Trie()
clones = pygtrie.Trie()


def add_clone(seq, v):
    """
    Record ``seq`` with value ``v`` in ``clones`` and its suffixes in
    ``clone_fragments``.

    Before each insertion, every existing entry whose key is a prefix of the
    key being inserted and whose value's first element is ``<= v[0]`` is
    deleted from the respective trie.

    NOTE(review): entries are deleted while ``prefixes()`` is still being
    iterated -- confirm this is safe for the pygtrie version in use.
    """
    for item in clones.prefixes(seq):
        # item is a (key, value) pair; value[0] is compared against v[0].
        if item[1][0] <= v[0]:
            del clones[item[0]]
    clones[seq] = v
    # Repeat the same pruning and insertion for every proper suffix of seq.
    for i in range(1, len(seq)):
        for item in clone_fragments.prefixes(seq[i:]):
            if item[1][0] <= v[0]:
                del clone_fragments[item[0]]
        clone_fragments[seq[i:]] = v
def updateChoices(self, choices):
    """
    Replace the stored choices and reset the search tree.

    Falsy entries are dropped, the remainder is stored sorted on
    ``self.choices``, and ``self.searchtree`` becomes a new empty trie.

    :param choices: iterable of choices
    """
    kept = [choice for choice in choices if choice]
    kept.sort()
    self.choices = kept
    self.searchtree = trie.Trie()
def gen(cs, N, size):
    """
    Build a trie of random words drawn from ``cs``.

    ``numpy.random.choice(cs, [N, size])`` draws an ``N`` x ``size`` sample;
    the string form of each row is stored as both key and value.

    :param cs: population passed to ``numpy.random.choice``
    :param N: number of rows (words) to draw
    :param size: number of elements per row
    :return: the populated ``pygtrie.Trie``
    """
    sampled = numpy.random.choice(cs, [N, size])
    result = pygtrie.Trie()
    for row in sampled:
        text = str(row)
        result[text] = text
    return result
def _load_suffixes():
    """
    Load ``suffixes_list.txt`` from ``RESOURCE_PATH`` into a trie.

    The file content is split on ``'\\n'``; each resulting entry is stored
    under its reversed spelling, with the original spelling as the value.

    :return: the populated trie
    """
    listing_path = os.path.join(RESOURCE_PATH, 'suffixes_list.txt')
    reversed_suffixes = trie.Trie()
    with open(listing_path, 'r', encoding='utf-8') as fp:
        content = fp.read()
    for suffix in content.split('\n'):
        reversed_suffixes[suffix[::-1]] = suffix
    return reversed_suffixes
def init_path_details(self, path, sg):
    """
    Prepare, for every node on ``path``, what is needed to join it.

    Three lists aligned with ``path`` are built and returned:

    * ``path_execs`` -- SQL text for the node (from ``NEXT_HOP_TMP``), or
      None where a trie is used instead;
    * ``path_join_keys`` -- list of column names whose already-selected
      values drive the join, or None;
    * ``path_tries`` -- a ``pygtrie.Trie`` mapping ``str(join value)`` to
      the list of fetched rows, or None when SQL is used.

    Index 0 (the first node) is None in all three; its rows are fetched
    once and cached in ``self.init_sels``.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; verify the indentation against the original file.

    :param path: sequence of node names (aliases) in ``sg``
    :param sg: graph whose nodes carry "real_name", "sels", "predicates"
        and "pred_cols", and whose edge data maps each endpoint to its
        join column
    :return: tuple ``(path_execs, path_join_keys, path_tries)``
    """
    print("going to initiate path details for: ", path)
    # to return:
    path_execs = []
    path_join_keys = []
    path_tries = []

    first_node = path[0]
    node_info = sg.nodes()[first_node]
    table = ALIAS_FORMAT.format(TABLE=node_info["real_name"],
                                ALIAS=first_node)
    nodes_seen = set()
    # Fetch the first node's rows only once; later paths starting at the
    # same node reuse self.init_sels.
    if first_node not in self.init_sels:
        sels = ",".join(node_info["sels"])
        where_clause = ""
        if len(node_info["predicates"]) > 0:
            preds = " AND ".join(node_info["predicates"])
            where_clause = "WHERE " + preds
        exec_sql = FIRST_HOP_TMP.format(SELS=sels, TABLE=table,
                                        WHERE=where_clause)
        self.cursor.execute(exec_sql)
        outputs = self.cursor.fetchall()
        self.init_sels[first_node] = outputs
        print("computed first hop outputs: ", len(outputs))

    # for first node
    nodes_seen.add(first_node)
    path_execs.append(None)
    path_join_keys.append(None)

    # for rest of the path, compute join statements
    if self.use_tries:
        print("creating tries...")
    path_tries.append(None)

    for node_idx in range(1, len(path), 1):
        # NOTE(review): created_index is never read in this function.
        created_index = False
        node = path[node_idx]
        node_info = sg.nodes()[node]
        table = ALIAS_FORMAT.format(TABLE=node_info["real_name"],
                                    ALIAS=node)
        sels = ",".join(node_info["sels"])
        # Edges connecting the already-visited nodes to the current node.
        join_edges = list(nx.edge_boundary(sg, nodes_seen, {node}))
        assert len(join_edges) != 0
        nodes_seen.add(node)

        # FIXME: check triangle condition
        if self.use_tries:
            # Only the first boundary edge is used for the join.
            join = join_edges[0]
            path_execs.append(None)
            where_clause = ""
            if len(node_info["predicates"]) > 0:
                preds = " AND ".join(node_info["predicates"])
                where_clause = "WHERE " + preds
            exec_sql = FIRST_HOP_TMP.format(SELS=sels, TABLE=table,
                                            WHERE=where_clause)
            print("exec sql for trie: ", exec_sql)
            # Edge data is keyed by endpoint: join[1] is the current node's
            # column, join[0] the column on the already-seen side.
            cur_join_col = sg[join[0]][join[1]][join[1]]
            other_col = sg[join[0]][join[1]][join[0]]
            print("cur join col: ", cur_join_col)
            print("other col: ", other_col)

            # Position of the join column inside the selected columns; it
            # indexes into each fetched row below.
            trie_idx = None
            for sel_i, sel in enumerate(node_info["sels"]):
                if sel == cur_join_col:
                    trie_idx = sel_i
            assert trie_idx is not None
            path_join_keys.append([other_col])

            trie_key_name = node_info["sels"][trie_idx]
            # Cache key covers both the query text and the join column.
            sql_key = deterministic_hash(exec_sql + trie_key_name)
            if sql_key in self.trie_cache:
                kl_start = time.time()
                trie = self.trie_cache[sql_key]
                print("loading trie {} from in memory klepto took: {}".
                      format(node, time.time() - kl_start))
            elif sql_key in self.trie_cache.archive:
                # trie = None
                kl_start = time.time()
                trie = self.trie_cache.archive[sql_key]
                print("loading trie {} from klepto took: {}".format(
                    node, time.time() - kl_start))
            else:
                # Not cached anywhere: run the query and group the rows by
                # the string form of their join-column value.
                st = time.time()
                self.cursor.execute(exec_sql)
                outputs = self.cursor.fetchall()
                trie = pygtrie.Trie()
                for out in outputs:
                    if str(out[trie_idx]) not in trie:
                        trie[str(out[trie_idx])] = []
                    trie[str(out[trie_idx])].append(out)
                trie_time = time.time() - st
                print("trie for {}, len: {}, took: {}".format(
                    node, len(outputs), trie_time))
                self.total_trie_time += trie_time
                if trie_time > TRIE_USE_THRESHOLD:
                    # Too slow to build: discard the trie and archive None
                    # so the SQL fallback below is used from now on.
                    trie = None
                    self.trie_cache.archive[sql_key] = trie
                elif trie_time > TRIE_ARCHIVE_THRESHOLD \
                        and trie_time < TRIE_USE_THRESHOLD:
                    self.trie_cache.archive[sql_key] = trie

            # no matter what, store in memory cache so we avoid reloading
            # it from archive in the next path
            self.trie_cache[sql_key] = trie
            path_tries.append(trie)
            if trie is None:
                # Reset so the SQL fallback below can fill in its own keys.
                path_join_keys[-1] = None
        else:
            path_tries.append(None)
            path_join_keys.append(None)

        # No trie for this node: build the parameterised join query instead.
        if path_tries[-1] is None:
            fkey_conds = []
            cur_join_cols = []
            # NOTE(review): index_cols is filled but never read here.
            index_cols = []
            join = join_edges[0]
            assert node == join[1]
            # a value for this column would already have been selected
            other_col = sg[join[0]][join[1]][join[0]]
            cur_join_cols.append(other_col)
            # other_val = vals[other_col]
            cur_col = sg[join[0]][join[1]][join[1]]
            col_name = cur_col.split(".")[1]
            if col_name not in index_cols:
                index_cols.append(col_name)

            # "X<col>X" is a placeholder embedded in the SQL text;
            # presumably substituted with a value by the caller -- confirm.
            other_col_key = "X" + other_col + "X"
            cond = cur_col + " = " + other_col_key
            fkey_conds.append(cond)

            # path_join_keys.append(cur_join_cols)
            assert path_join_keys[-1] is None
            path_join_keys[-1] = cur_join_cols

            assert len(fkey_conds) != 0
            # FIXME: check math
            fkey_conds += node_info["predicates"]
            fkey_cond = " AND ".join(fkey_conds)
            for col in node_info["pred_cols"]:
                col_name = col.split(".")[1]
                if col_name not in index_cols:
                    index_cols.append(col_name)

            exec_sql = NEXT_HOP_TMP.format(FKEY_CONDS=fkey_cond,
                                           TABLE=table, SELS=sels)
            # assert path_execs[-1] is None
            # NOTE(review): drops into the debugger if the slot is already
            # filled. When self.use_tries is False no new slot is appended
            # above, so this would appear to trigger on the second hop --
            # confirm against the original indentation.
            if path_execs[-1] is not None:
                print(exec_sql)
                print(path_execs)
                pdb.set_trace()
            path_execs[-1] = exec_sql

    return path_execs, path_join_keys, path_tries
def __init__(self, words):
    """
    Create an empty trie on ``self.nodes`` and pass ``words`` to
    ``self.maketrie``.

    :param words: forwarded unchanged to ``self.maketrie``
    """
    self.nodes = trie.Trie()
    self.maketrie(words)
def __init__(self):
    """Start with an empty trie stored on ``self.tree``."""
    self.tree = trie.Trie()
def main_algo(features, tweetid, lastclusterid):
    """
    Assign tweets to clusters using signature prefix trees.

    For every feature vector a signature is computed, permuted ``modP``
    times and inserted into one prefix tree per permutation; the nearest
    neighbours found in those trees are then compared by cosine similarity
    against threshold ``T`` to join an existing cluster or open a new one.

    NOTE(review): the nesting below was reconstructed from a flattened
    source; in particular the attachment of the two ``else`` branches in
    the clustering step should be verified against the original file.

    :param features: input passed to ``tfidf_all.get_tfidf_freqdict``
    :param tweetid: id given to the first vector; incremented per vector
    :param lastclusterid: highest cluster id already in use
    :return: tuple ``(clusterdict, fvecs, freqdict)``
    """
    fvecs, freqdict = tfidf_all.get_tfidf_freqdict(features)

    # Creating random vectors
    num_randvecs = 13
    random_vectors = randomvecs.getVecs(len(freqdict), num_randvecs)

    # Initialising prefix trees
    a = []
    b = []
    prime = 13
    P = []
    # modP = int(input("Enter number of permutations to be used : "))
    modP = 20
    # One (a, b) coefficient pair and one trie per permutation.
    for i in range(modP):
        atemp = random.uniform(1, prime)
        btemp = random.uniform(0, prime)
        a.append(atemp)
        b.append(btemp)
        P.append(pygtrie.Trie())

    # Give every key of freqdict a running index.
    index = 0
    wordindexmap = {}
    for key in freqdict.keys():
        wordindexmap[key] = index
        index = index + 1

    # MAIN TWEET LOOP
    tweetclustermap = {}
    clusterdict = {}
    for fvec in fvecs:
        tweetsign = signature.getSign(fvec, random_vectors, wordindexmap)

        # Insert tweet signature in prefix tree and find its nearest neighbor in that tree
        nearestNeighbours = []
        for i in range(modP):
            # Permute the signature: position x reads from
            # int(a*x + b) % prime of the original signature.
            signPerm = [None] * len(tweetsign)
            for x in range(len(tweetsign)):
                ind = int(a[i] * x + b[i]) % prime
                signPerm[x] = tweetsign[ind]
            if P[i].has_key(signPerm):
                P[i][signPerm].append(tweetid)
            else:
                P[i][signPerm] = [tweetid]
            neighbor, hdist = nearest_neighbor.getNN(signPerm, P[i])
            if (neighbor == None):
                # No neighbour found: the bare expression is a no-op.
                None
            elif hdist == 0:
                # Remove this tweet from the returned list.
                # NOTE(review): if getNN returns the list stored in the
                # trie, this also removes the id inserted above -- confirm.
                neighbor.remove(tweetid)
                nearestNeighbours.append((neighbor, hdist))
            elif hdist == 1:
                nearestNeighbours.append((neighbor, hdist))
            elif (hdist > 1):
                # Here neighbor is iterated as pairs whose second element
                # is a list; those lists are concatenated.
                templist = []
                for item in neighbor:
                    templist += item[1]
                nearestNeighbours.append((templist, hdist))

        # Find the smallest distance over all permutations, then collect
        # the distinct neighbours found at exactly that distance.
        mindist = len(signPerm) + 10
        closestNeighbors = []
        for pair in nearestNeighbours:
            if pair[1] <= mindist:
                mindist = pair[1]
        for pair in nearestNeighbours:
            if pair[1] == mindist:
                for i in range(len(pair[0])):
                    if not pair[0][i] in closestNeighbors:
                        closestNeighbors.append(pair[0][i])

        # T = float(input("Enter the similarity threshold : "))
        T = 0.05
        # NOTE(review): these two entries are overwritten on every
        # iteration of the tweet loop -- confirm this is intended.
        tweetclustermap[0] = 0
        clusterdict[0] = [0]
        for cneighbor in closestNeighbors:
            # fvecs is indexed by neighbour id -- assumes tweet ids are
            # positions in fvecs; TODO confirm.
            if (similarity.cosine_similarity(fvec, fvecs[cneighbor]) >= T):
                if (tweetid in tweetclustermap.keys()):
                    if (not (tweetclustermap[tweetid] == tweetclustermap[cneighbor])):
                        tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                        clusterdict[tweetclustermap[cneighbor]].append(tweetid)
                else:
                    tweetclustermap[tweetid] = tweetclustermap[cneighbor]
                    clusterdict[tweetclustermap[cneighbor]].append(tweetid)
            else:
                # Not similar enough: open a new cluster unless the tweet
                # has already been assigned.
                if (not (tweetid in tweetclustermap.keys())):
                    tweetclustermap[tweetid] = lastclusterid + 1
                    clusterdict[lastclusterid + 1] = [tweetid]
                    lastclusterid += 1
        tweetid = tweetid + 1
    return clusterdict, fvecs, freqdict
"Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe", "↑ UN member states and observer states ↑", "", "↓ Other states ↓", "Abkhazia", "Artsakh", "Cook Islands", "Kosovo", "Niue", "Northern Cyprus", "Sahrawi Arab Democratic Republic", "Somaliland", "South Ossetia", "Taiwan", "Transnistria", "↑ Other states ↑" ] us_states = [ "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky[E]", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming" ] x = trie.Trie() output = {} for country in countries: for i in country.split(): if not x.has_key(i): x[i] = [] x[i].append(country) a = 10
def __init__(self):
    """Start with an empty ``pygtrie.Trie`` stored on ``self.data``."""
    self.data = pygtrie.Trie()
def train(self):
    """
    Collect candidate words from ``self.doc`` and score them.

    The document is read line by line, stripped of the characters matched
    by ``pattern`` and scanned in batches: every substring of up to
    ``self.max_word`` characters becomes a ``Word`` candidate that is told
    its left and right neighbouring characters. Afterwards per-word
    statistics, aggregation, inner entropy and a final score are computed,
    candidates longer than one character are stored on ``self.words``
    (sorted by descending ``freq``), averages are printed and the result
    is written to ``candidates_statistics.csv``.

    NOTE(review): Python 2 code (``print`` statements, ``xrange``,
    ``iteritems``). The averages divide by ``len(self.words)``, so an
    input that yields no multi-character candidate would raise
    ZeroDivisionError -- confirm whether that can happen.
    """
    print "counting training doc ..."
    pattern = re.compile(u'[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+')
    # candidates = hat_trie.Trie()
    candidates = pygtrie.Trie()
    self.doc_length = 0
    # Note: doc starts as a single space. The character at doc[0] is never
    # a scan target itself; it only serves as the left neighbour of doc[1],
    # so the left neighbour of the starting point is not lost when the next
    # batch is processed.
    doc = u' '
    line_cnt = 0
    with codecs.open(self.doc, 'r', 'utf-8') as f:
        for line in f:
            line = re.sub(pattern, '', line)
            self.doc_length += len(line)
            doc += line
            line_cnt += 1
            if line_cnt % 10000 == 0:
                print "{} lines processed".format(line_cnt)
            # if line_cnt == 110000:
            #     break
            # Process once every batch_size characters.
            if len(doc) < self.batch_size:
                continue
            length = len(doc)
            # Start at 1 to keep the left neighbour left over from the
            # previous pass. Stop before length so that every start
            # position of this pass can yield a substring of
            # self.max_word characters.
            # E.g. length = 10 ==> 0 1 2 3 4 5 6 7 8 9
            #      self.max_word = 5 ==> i goes up to 4 at most, which
            #      still yields the substring "45678" together with its
            #      right neighbour '9'.
            for i in xrange(1, length - self.max_word):
                for j in xrange(i + 1, i + self.max_word + 1):
                    text = doc[i: j]
                    if text not in candidates:
                        candidates[text] = Word(text)
                    candidates[text].meet(doc[i - 1: i], doc[j: j + 1])
            # This batch is done; the characters already processed can be
            # dropped before the next batch. The last processed character
            # must stay, because it becomes doc[0] (the left neighbour) of
            # the next batch.
            doc = doc[length - self.max_word - 1:]
    # The loop is over; doc still holds fewer than self.batch_size
    # characters that need to be processed.
    length = len(doc)
    # As before, skip doc[0].
    for i in xrange(1, length):
        for j in xrange(i + 1, min(i + self.max_word + 1, length + 1)):
            text = doc[i: j]
            if text not in candidates:
                candidates[text] = Word(text)
            candidates[text].meet(doc[i - 1: i], doc[j: j + 1])
    # Compute freq and the left/right neighbour entropies.
    print "making statistics ..."
    # hat_trie has not iteritems() func
    # for text in candidates.iterkeys():
    #     candidates[text].statistics(self.doc_length)
    for _, word in candidates.iteritems():
        word.statistics(self.doc_length)
    # All freq and left/right entropies are available now, so the
    # aggregation (cohesion), the inner entropy and the final score can be
    # computed.
    print "calculating aggregations ...."
    # for text in candidates.iterkeys():
    #     if len(text) < 2:
    #         continue
    #     word = candidates[text]
    for text, word in candidates.iteritems():
        if len(text) < 2:
            continue
        word.aggreg = Algorithm.aggregation(word, candidates)
        word.inner = Algorithm.inner_entropy(word, candidates)
        word.score = word.aggreg + min(word.left, word.right) - word.inner
    # Single-character entries are of no further use; the vocabulary only
    # keeps entries of two or more characters.
    # self.words = sorted([candidates[text] for text in candidates.iterkeys() if len(text) > 1], key=lambda v: v.freq, reverse=True)
    self.words = sorted([word for text, word in candidates.iteritems() if len(text) > 1], key=lambda v: v.freq, reverse=True)
    # Some summary statistics.
    total = float(len(self.words))
    print "Avg len: ", sum([len(w.text) for w in self.words]) / total
    print "Avg freq: ", sum([w.freq for w in self.words]) / total
    print "Avg left ent: ", sum([w.left for w in self.words]) / total
    print "Avg right ent: ", sum([w.right for w in self.words]) / total
    print "Avg aggreg: ", sum([w.aggreg for w in self.words]) / total
    print "Avg inner ent: ", sum([w.inner for w in self.words]) / total
    print "Avg score: ", sum([w.score for w in self.words]) / total
    # Save the current result (one tab-separated row per word).
    with codecs.open("candidates_statistics.csv", "w", "utf-8") as f:
        for w in self.words:
            f.write(u"{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(w.text, w.freq, w.left, w.right, w.aggreg, w.inner, w.score))
# -*- coding: utf-8 -*- import re import django import codecs import json import unicodecsv import pygtrie from collections import defaultdict django.setup() from sefaria.model import * maxrabbilen = 0 with open("RabbisNames.csv", 'rb') as fin: tonorabbiscsv = unicodecsv.DictReader(fin) tonorabbis = pygtrie.Trie() for row in tonorabbiscsv: rabbiName = u"" for i in range(1, 11): tempName = row[u"Name{}".format(i)] if not tempName: break if i > 1: rabbiName += u" " rabbiName += tempName rabbiName = rabbiName.replace(u"ר'", u"רבי") if len(rabbiName) > maxrabbilen: maxrabbilen = len(rabbiName) tonorabbis[rabbiName] = 0 pass