def run(argv):
    """Build and save two marisa tries (cpra->rsids and rsid->cpra) from sites.tsv.

    Does nothing when both on-disk tries are already current according to
    should_replace().
    """
    if not should_replace(cpra_to_rsids_trie_fname) and not should_replace(
            rsid_to_cpra_trie_fname):
        print('tries are up-to-date!')
    else:
        with open(os.path.join(conf.data_dir, 'sites', 'sites.tsv'), 'rt') as f:
            # NOTE(review): parse_line presumably returns (cpra_str, rsids_bytes)
            # pairs -- the b',' split below depends on rsids being bytes; confirm.
            lines = [parse_line(line) for line in f]
        print('done loading.')
        cpra_to_rsids_trie = marisa_trie.BytesTrie(
            lines, order=marisa_trie.LABEL_ORDER)
        cpra_to_rsids_trie.save(cpra_to_rsids_trie_fname)
        print('done with cpra->rsids trie at ' + cpra_to_rsids_trie_fname)
        # TODO: What if several different chrom-pos-ref-alts have the same rsid? Do we only get the first? Or the last?
        # Invert the mapping: one (rsid, cpra) entry per comma-separated rsid.
        reversed_lines = ((rsid.decode(), cpra.encode())
                          for (cpra, rsids) in lines
                          for rsid in rsids.split(b','))
        rsid_to_cpra_trie = marisa_trie.BytesTrie(
            reversed_lines, order=marisa_trie.LABEL_ORDER)
        rsid_to_cpra_trie.save(rsid_to_cpra_trie_fname)
        print('done with rsid->cpra trie at ' + rsid_to_cpra_trie_fname)
def run(argv):
    """Build the cpra<->rsid lookup tries from the sites file, unless current."""
    if '-h' in argv or '--help' in argv:
        print('Make tries for converting between chr-pos-ref-alt and rsid')
        exit(1)
    if should_replace(cpra_to_rsids_trie_filepath) or should_replace(rsid_to_cpra_trie_filepath):
        # Note: two identical VariantFileReaders are made in order to allow streaming
        # to reduce memory usage.  A different trie library might allow feeding both
        # tries while reading from a file, but marisa_trie doesn't.
        with VariantFileReader(sites_filepath) as reader:
            forward_pairs = (('{chrom}-{pos}-{ref}-{alt}'.format(**v), v['rsids'].encode('ascii'))
                             for v in reader)
            forward = marisa_trie.BytesTrie(forward_pairs, order=marisa_trie.LABEL_ORDER)
        forward.save(cpra_to_rsids_trie_filepath)
        print('done with cpra->rsids trie at ' + cpra_to_rsids_trie_filepath)
        # Note: if several different chrom-pos-ref-alts have the same rsid, then `trie[rsid]` = `[cpra1, cpra2, ...]`.
        with VariantFileReader(sites_filepath) as reader:
            def inverted_pairs():
                for v in reader:
                    if not v['rsids']:
                        continue
                    cpra = '{chrom}-{pos}-{ref}-{alt}'.format(**v).encode('ascii')
                    for rsid in v['rsids'].split(','):
                        yield (rsid, cpra)
            backward = marisa_trie.BytesTrie(inverted_pairs(), order=marisa_trie.LABEL_ORDER)
        backward.save(rsid_to_cpra_trie_filepath)
        print('done with rsid->cpra trie at ' + rsid_to_cpra_trie_filepath)
    else:
        print('tries are up-to-date!')
def run(argv):
    """Build both variant-lookup tries (cpra->rsids, rsid->cpra) if stale."""
    if should_replace(cpra_to_rsids_trie_filepath) or should_replace(rsid_to_cpra_trie_filepath):
        with VariantFileReader(sites_filepath) as reader:
            pairs = [('{chrom}-{pos}-{ref}-{alt}'.format(**v), v['rsids'].encode('ascii'))
                     for v in reader]
        print('done loading.')
        forward = marisa_trie.BytesTrie(pairs, order=marisa_trie.LABEL_ORDER)
        forward.save(cpra_to_rsids_trie_filepath)
        print('done with cpra->rsids trie at ' + cpra_to_rsids_trie_filepath)
        # TODO: What if several different chrom-pos-ref-alts have the same rsid? Do we only get the first? Or the last?
        inverted = (
            (rsid, cpra.encode('ascii'))
            for (cpra, rsids) in pairs
            for rsid in rsids.decode('ascii').split(',')
            if rsid
        )
        backward = marisa_trie.BytesTrie(inverted, order=marisa_trie.LABEL_ORDER)
        backward.save(rsid_to_cpra_trie_filepath)
        print('done with rsid->cpra trie at ' + rsid_to_cpra_trie_filepath)
    else:
        print('tries are up-to-date!')
def create_trie(): keys = [] values = [] start = time.time() db = Database(None) print "Connected to DB in", time.time() - start, "secs" start = time.time() db.execute("SELECT * FROM `Final_consolidated`") print "Retrieved Info from table in", time.time() - start, "secs" for row in db.cursor: value = [] for i, ele in enumerate(row): if i == 2: keys.append(ele.lower()) else: value.append(ele) values.append(json.dumps(value)) trie = marisa_trie.Trie() print "Starting to zip into Marisa trie" trie = marisa_trie.BytesTrie(zip(keys, values)) print "Completed zipping into Marisa trie" print "Pickling Trie" with open('consolidated.pkl', 'wb') as output: pickle.dump(trie, output, pickle.HIGHEST_PROTOCOL) print "Total number of Keys:", len(keys)
def test_iteritems(keys, values):
    """iteritems() must yield exactly what items() returns, with and without a prefix."""
    trie = marisa_trie.BytesTrie(zip(keys, values))
    assert trie.items() == list(trie.iteritems())
    for word in keys:
        pfx = word[:5]
        assert trie.items(pfx) == list(trie.iteritems(pfx))
def create_from_other(self, compdawg):
    """Build a trie and a DAWG from the unambiguous prefixes of `compdawg`.

    Only prefixes with exactly one associated value are kept.  (Python 2:
    uses dict.iteritems().)
    """
    counts = count_prefixes(compdawg)
    # keep only prefixes whose value-set has exactly one element
    unique = dict((k, v.pop()) for k, v in counts.iteritems() if len(v) == 1)
    self.trie = marisa_trie.BytesTrie(
        (k, v.encode("utf-8")) for k, v in unique.iteritems())
    self.dawg = dawg.BytesDAWG(
        (k, v.encode("utf-8")) for k, v in unique.iteritems())
def _pack_multiple_nodes(keys, nodes):
    """Serialize each node and return a BytesTrie mapping `keys` to the packed bytes."""
    packed = [bytes(n.pack_to_string(), encoding='utf8') for n in nodes]
    return mt.BytesTrie(zip(keys, packed))
def test_dumps_loads(data):
    """A BytesTrie must round-trip through pickle unchanged."""
    original = marisa_trie.BytesTrie(data)
    stream = io.BytesIO()
    pickle.dump(original, stream)
    stream.seek(0)
    restored = pickle.load(stream)
    assert original == restored
def test_getitem_multiple(self):
    """Duplicate keys accumulate values; lookup returns them in sorted order."""
    trie = marisa_trie.BytesTrie([('foo', b'x'), ('fo', b'y'), ('foo', b'a')])
    assert trie['fo'] == [b'y']
    assert trie['foo'] == [b'a', b'x']
def test_getitem_fuzzy(self):
    """Every inserted key maps back to its value; a missing key raises KeyError."""
    entries = self.data()
    trie = marisa_trie.BytesTrie(entries)
    for word, payload in entries:
        assert trie[word] == [payload]
    with pytest.raises(KeyError):
        trie['2135']
def test_contains(self):
    """Membership holds for every inserted key and fails for an absent one."""
    entries = self.data()
    trie = marisa_trie.BytesTrie(entries)
    for word, _payload in entries:
        assert word in trie
    assert '2135' not in trie
def test_getitem_multiple():
    """Duplicate keys accumulate values; lookup returns them in sorted order."""
    trie = marisa_trie.BytesTrie([("foo", b"x"), ("fo", b"y"), ("foo", b"a")])
    assert trie["fo"] == [b"y"]
    assert trie["foo"] == [b"a", b"x"]
def load_trie(filename):
    """Read a BytesTrie from the marisa_trie on-disk format at `filename`."""
    result = marisa_trie.BytesTrie()
    # marisa_trie emits spurious warnings while loading; silence them locally.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result.load(filename)
    return result
def test_contains(keys, values, missing_key):
    """Membership holds for every inserted key and fails for `missing_key`.

    Fixed: `zip()` returns a one-shot iterator on Python 3; the original
    exhausted it while building the trie, so the loop below iterated zero
    times and asserted nothing.  Materializing the pairs restores the checks.
    """
    assume(missing_key not in keys)
    data = list(zip(keys, values))
    trie = marisa_trie.BytesTrie(data)
    for word, _value in data:
        assert word in trie
    assert missing_key not in trie
def __init__(self, phenos):
    """Set up autocompletion over variants, rsids, phenocodes, genes, and
    (when any pheno defines one) phenostrings.

    Loads three on-disk marisa tries (cpra->rsids, rsid->cpra, gene aliases)
    from the paths in common_filepaths.
    """
    # Deep-copy so _preprocess_phenos() can mutate without touching the caller's dict.
    self._phenos = copy.deepcopy(phenos)
    self._preprocess_phenos()
    self._cpra_to_rsids_trie = marisa_trie.BytesTrie().load(
        common_filepaths['cpra-to-rsids-trie'])
    self._rsid_to_cpra_trie = marisa_trie.BytesTrie().load(
        common_filepaths['rsid-to-cpra-trie'])
    self._gene_alias_trie = marisa_trie.BytesTrie().load(
        common_filepaths['gene-aliases-trie'])
    # NOTE(review): presumably consulted in this order by the autocomplete
    # entry point -- confirm against the caller.
    self._autocompleters = [
        self._autocomplete_variant,
        self._autocomplete_rsid,
        self._autocomplete_phenocode,
        self._autocomplete_gene,
    ]
    # Only offer phenostring completion when at least one pheno has one.
    if any('phenostring' in pheno for pheno in self._phenos.values()):
        self._autocompleters.append(self._autocomplete_phenostring)
def test_iteritems(self):
    """iteritems() must match items(), both globally and per prefix."""
    words = get_random_words(1000)
    blobs = get_random_binary(1000)
    trie = marisa_trie.BytesTrie(zip(words, blobs))
    assert trie.items() == list(trie.iteritems())
    for word in words:
        pfx = word[:5]
        assert trie.items(pfx) == list(trie.iteritems(pfx))
def build_trie(file):
    """Build a BytesTrie from a tab-separated file of `value<TAB>key` lines.

    Column 1 (stripped) becomes the unicode key; column 0 (stripped, utf-8
    encoded) becomes the value -- BytesTrie requires bytes values.
    """
    keys = []
    values = []
    with codecs.open(file, 'r', 'utf-8') as f:
        # Iterate the file object directly instead of readlines(): streams
        # line by line rather than loading the whole file into memory.
        for line in f:
            parts = line.split(u'\t')
            keys.append(parts[1].strip())
            # byte trie needs bytes
            values.append(parts[0].strip().encode('utf-8'))
    return marisa_trie.BytesTrie(zip(keys, values))
def test_getitem(keys, values, missing_key):
    """Each key maps to exactly its value; a missing key raises KeyError.

    Fixed: `zip()` is a one-shot iterator on Python 3; the original consumed
    it while constructing the trie, leaving the verification loop empty.
    Materializing the pairs restores the assertions.
    """
    assume(missing_key not in keys)
    data = list(zip(keys, values))
    trie = marisa_trie.BytesTrie(data)
    for key, value in data:
        assert trie[key] == [value]
    with pytest.raises(KeyError):
        trie[missing_key]
def test_has_keys_with_prefix():
    """has_keys_with_prefix() is true for any prefix of a stored key, including ''."""
    trie = marisa_trie.BytesTrie([
        ('apple', b'foo'),
        ('pear', b'bar'),
        ('peach', b'baz'),
    ])
    for good_prefix in ('', 'a', 'pe', 'pear'):
        assert trie.has_keys_with_prefix(good_prefix)
    assert not trie.has_keys_with_prefix('x')
def test_null_bytes_in_values(self):
    """Values containing NUL bytes must round-trip through the trie intact.

    Fixed: under Python 3 `zip()` is a one-shot iterator, so the original
    exhausted `data` while building the trie and the loop below never ran.
    `list()` is a no-op on Python 2 and restores the checks on Python 3.
    """
    keys = get_random_words(10000)
    values = get_random_binary(10000)
    assert any(b'\x00' in p for p in values)
    data = list(zip(keys, values))
    trie = marisa_trie.BytesTrie(data)
    for key, value in data:
        assert trie[key] == [value]
def test_pickling(self):
    """A pickled-and-restored trie answers lookups identically."""
    source = marisa_trie.BytesTrie([
        ('foo', b'foo'),
        ('bar', b'bar'),
    ])
    stream = io.BytesIO()
    pickle.dump(source, stream)
    stream.seek(0)
    restored = pickle.load(stream)
    assert restored['foo'] == [b'foo']
    assert restored['bar'] == [b'bar']
def __init__(self, seq):
    """Index each match field of `seq` into its own BytesTrie, keyed by that
    field's entries with seq.dl_vlan as the stored values."""
    for field in ('nw_dst', 'nw_src', 'metadata', 'in_port', 'dl_dst', 'dl_src'):
        setattr(self, field,
                marisa_trie.BytesTrie(zip(getattr(seq, field), seq.dl_vlan)))
def analyze(input):
    """Run named-entity lookup over `input` using the compiled NE dictionary.

    Returns a dict with keys 'ne' (matched entity strings), 'neTags'
    (their decoded tags), 'neNum' (count), and 'morphResult' (the morph
    analysis passed through from gen_prefix_array).
    """
    # If the dictionary is being compiled, wait 5 seconds before loading it.
    if is_compiling():
        time.sleep(5)
    compile_dic = marisa_trie.BytesTrie().load(COMPILED_NE_DIC_PATH)
    morphs, search_iter, search_index, morph_result = gen_prefix_array(input)
    # resultlist = []
    ne_list = []
    ne_tag_list = []
    # True selects longest-match; the shortest-match branch below is currently dead.
    longest = True
    for i in range(len(search_iter)):
        candi = search_iter[i]
        # print (self.searchIndex[idx])
        findlist = compile_dic.prefixes(candi)
        LOGGER.info('candi={}, findlist={}'.format(candi, findlist))
        if len(findlist) == 0:
            continue
        # Longest match: keep only the longest dictionary prefix.
        if longest:
            item = max(findlist, key=len)
            # print("item=", item)
            # for tag in compile_dic:
            # Skip entities already recorded (dedupes by surface form).
            if item in compile_dic and item not in ne_list:
                # values = [value.decode('utf-8') for value in compile_dic[item]]
                # resultlist.append((item, values))
                for value in compile_dic[item]:
                    ne_list.append(item)
                    ne_tag_list.append(value.decode('utf-8'))
        # Shortest match: record every matching prefix.
        else:
            for item in findlist:
                # print("item1=", item)
                # for tag in compile_dic:
                # resultlist.append((item, tag))
                if item in compile_dic:
                    # values = [value.decode('utf-8') for value in compile_dic[item]]
                    # resultlist.append((item, values))
                    for value in compile_dic[item]:
                        ne_list.append(item)
                        ne_tag_list.append(value.decode('utf-8'))
    # Named-entity analysis result structure.
    ne_result = {
        'ne': ne_list
        , 'neTags': ne_tag_list
        , 'neNum': len(ne_list)
        , 'morphResult': morph_result
    }
    return ne_result
def test_items(self):
    """items() honors prefixes and includes duplicate-key entries."""
    entries = [
        ('fo', b'y'),
        ('foo', b'x'),
        ('foo', b'a'),
    ]
    trie = marisa_trie.BytesTrie(entries)
    assert set(trie.items()) == set(entries)
    for pfx in ('f', 'fo'):
        assert set(trie.items(pfx)) == set(entries)
    assert set(trie.items('foo')) == set(entries[1:])
    assert trie.items('food') == []
    assert trie.items('bar') == []
def test_items():
    """items() honors prefixes and includes duplicate-key entries."""
    entries = [
        ("fo", b"y"),
        ("foo", b"x"),
        ("foo", b"a"),
    ]
    trie = marisa_trie.BytesTrie(entries)
    assert set(trie.items()) == set(entries)
    for pfx in ("f", "fo"):
        assert set(trie.items(pfx)) == set(entries)
    assert set(trie.items("foo")) == set(entries[1:])
    assert trie.items("food") == []
    assert trie.items("bar") == []
def _read_ipa2xs(self):
    """Load the packaged IPA->X-SAMPA mapping CSV into a BytesTrie."""
    path = pkg_resources.resource_filename(
        __name__, os.path.join('data', self.ipa2xs_fn))
    with open(path, 'rb') as f:
        reader = csv.reader(f, encoding='utf-8')
        next(reader)  # skip the header row
        pairs = [(ipa, xs.encode('utf-8')) for ipa, xs, _ in reader]
    return marisa_trie.BytesTrie(pairs)
def test_keys(self):
    """keys() lists every stored key (duplicates included), filtered by prefix."""
    trie = marisa_trie.BytesTrie([
        ('foo', b'x'),
        ('fo', b'y'),
        ('foo', b'a'),
    ])
    # FIXME: ordering?
    expected = ['foo', 'foo', 'fo']
    assert trie.keys() == expected
    assert trie.keys('f') == expected
    assert trie.keys('fo') == expected
    assert trie.keys('foo') == ['foo', 'foo']
    assert trie.keys('food') == []
    assert trie.keys('bar') == []
def test_keys():
    """keys() lists every stored key (duplicates included), filtered by prefix."""
    trie = marisa_trie.BytesTrie([
        ("foo", b"x"),
        ("fo", b"y"),
        ("foo", b"a"),
    ])
    # FIXME: ordering?
    expected = ["foo", "foo", "fo"]
    assert trie.keys() == expected
    assert trie.keys("f") == expected
    assert trie.keys("fo") == expected
    assert trie.keys("foo") == ["foo", "foo"]
    assert trie.keys("food") == []
    assert trie.keys("bar") == []
def Main(argv):
    """Interactive morphological analyser: look up each whitespace-separated
    word from stdin in a marisa_trie dictionary; a lone "." exits the program.
    """
    # Argument parsing.
    # NOTE(review): PrintHelp() is invoked here, so it presumably returns the
    # usage string rather than printing it -- confirm.
    parser = argparse.ArgumentParser(usage=PrintHelp())
    parser.add_argument("marisa", help="Cesta ku slovníku vo formáte marisa_trie.")
    args = parser.parse_args()
    # Load the trie from disk.
    trie = marisa_trie.BytesTrie()
    trie.load(args.marisa)
    print("Morphological analyser")
    # Read words from stdin.
    for line in sys.stdin:
        for word in line.split():
            if (word == "."):
                sys.exit()
            PrintTrie(trie, word)
def run(argv):
    """Build and save the gene-alias BytesTrie, downloading genes first if needed."""
    if not os.path.exists(common_filepaths['genes']):
        print('Downloading genes')
        from . import download_genes
        download_genes.run([])
    aliases_filepath = common_filepaths['gene-aliases-trie']
    if os.path.exists(aliases_filepath):
        print('gene aliases are at {!r}'.format(aliases_filepath))
        return
    print('gene aliases will be stored at {!r}'.format(aliases_filepath))
    # BytesTrie values must be bytes, so ascii-encode the canonical names.
    pairs = [(alias, canonical.encode('ascii'))
             for alias, canonical in get_gene_aliases().items()]
    marisa_trie.BytesTrie(pairs).save(aliases_filepath)