def __init__(self):
    """Load the college, city, province and country word lists into TSTs.

    Reads the files named by the module-level ``college_path``,
    ``city_path``, ``province_path`` and ``country_path`` and fills:

    * ``self.c_tst``        -- first field -> second field of every college line
    * ``self.province_tst`` -- province -> province, collected from the first
      field of the city file plus every line of the province file
    * ``self.city_tst``     -- second field of a city line -> the whole
      split line
    * ``self.country_tst``  -- first field of every country line -> itself

    Every file is opened in a ``with`` block so the handle is released even
    if a line fails to parse (the province file used to be left open).
    """
    # College file: every line must split into exactly two space-separated
    # fields, otherwise dict() raises ValueError (unchanged behaviour).
    self.c_tst = tst.TST()
    with open(college_path) as college_file:
        college_map = dict([line.strip().split(" ") for line in college_file])
    for name, value in college_map.items():
        self.c_tst.put(name, value)

    self.city_tst = tst.TST()
    self.province_tst = tst.TST()

    with open(city_path) as city_file:
        city_rows = [line.strip().split(" ") for line in city_file]

    # Provinces come from two sources: the first column of the city file and
    # the dedicated province file.
    provinces = set(row[0] for row in city_rows)
    with open(province_path) as province_file:
        province_lines = set(line.strip() for line in province_file)
    provinces = provinces.union(province_lines)

    for province in provinces:
        self.province_tst.put(province, province)
    for row in city_rows:
        # The stored value is the complete split line, not just the city.
        self.city_tst.put(row[1], row)

    self.country_tst = tst.TST()
    with open(country_path) as country_file:
        countries = set(line.strip().split(" ")[0] for line in country_file)
    for country in countries:
        self.country_tst.put(country, country)
def split_entities(w, entity_words):
    """Scan *w* against a TST built from *entity_words*.

    Every entity word is stored in a fresh TST as both key and value, then
    *w* is scanned with ``tst.TupleListAction()``.

    Returns a list of 2-tuples holding the first two fields of each tuple
    produced by the scan.
    NOTE(review): the meaning of those fields is defined by
    ``tst.TupleListAction`` -- confirm against the pytst documentation.
    """
    entity_tree = tst.TST()
    for entity in entity_words:
        entity_tree.put(entity, entity)

    pairs = []
    for match in entity_tree.scan(w, tst.TupleListAction()):
        pairs.append((match[0], match[1]))
    return pairs
def buildTSTDictionary(self, wordList, saveFile="..\\resources\\code\\arg-dictionary.tst"):
    """Store every word of *wordList* in a new TST and write it to *saveFile*.

    Each word is inserted with the value ``1``.

    Returns True when the tree was written, False when writing raised
    IOError.
    """
    dictionary = tst.TST()
    for entry in wordList:
        dictionary[entry] = 1

    try:
        dictionary.write_to_file(saveFile)
    except IOError:
        return False
    return True
def buildTST():
    """Return a new TST bulk-loaded from the module-level ``corpus``."""
    tree = tst.TST()
    tree.bulk_add(corpus)
    return tree
def loadTSTDictionary(self, fileName):
    """Load a TST from *fileName* into ``self.dict``.

    ``self.dict`` is set to a fresh TST before reading; if the read raises
    IOError it is reset to None.
    """
    tree = tst.TST()
    # Assign before reading so the attribute is in place while the file loads.
    self.dict = tree
    try:
        tree.read_from_file(fileName)
    except IOError:
        self.dict = None
def MultiReplaceWithDict(replace_dict):
    """Build a TST from *replace_dict* and wrap it with MultiReplaceWithTST.

    Every key/value pair of *replace_dict* is copied into a new TST, which
    is then passed to ``MultiReplaceWithTST``; that call's result is
    returned.
    """
    tree = tst.TST()
    # items() exists on both Python 2 and Python 3; iteritems() is
    # Python-2-only and raises AttributeError on Python 3.
    for key, value in replace_dict.items():
        tree[key] = value
    return MultiReplaceWithTST(tree)
unicodedata.name(c)[0:8] == "KATAKANA" or unicodedata.name(c)[0:3] == "CJK" or unicodedata.name(c)[0:5] == "DIGIT" or unicodedata.name(c)[0:5] == "LATIN"): return False return True def createKeywordTree(filename): """ファイルに登録されている見出し語からキーワードツリーを作成して返す""" try: fp = codecs.open(filename, "r", "utf-8") except IOError, e: print e return None tree = tst.TST() for line in fp: word = line.rstrip() if isValid(word): tree[word] = True # wordをTSTに登録 fp.close() return tree def analysis(text, tree): """textからWikipedia見出し語を抽出する""" keywords = [] result = tree.scan(text, tst.TupleListAction()) for t in result: if t[2] == True: try: word = unicode(t[0], "utf-8")
def template_setup(self):
    """Set up the TST trees used for matching and the signature status dict.

    Parses the signature XML configuration, then builds one TST tree per
    distinct frame value tag (e.g. one tree for rtheader matches, one for
    address matches) and stores the trees in ``self.tst_tree_dict`` keyed by
    the normalised tag. Each tree maps an expanded pattern to the list of
    signatures that use it. Finally ``self.signature_status_dict`` is
    rebuilt with one entry per signature.

    Returns:
        True on success, False if the signature file could not be accessed
        or is not valid.
    """

    def process_pattern(base_pattern, pattern_tag, pattern_type):
        """Normalise a pattern and expand its wildcards.

        The pattern is upper-cased and stripped of spaces. Each ``*`` is
        replaced by every character of the alphabet selected by
        *pattern_type* ('word', 'bit', 'decimal'; anything else uses
        upper-case hex digits).

        Returns:
            The list of concrete subpatterns, or an empty list when the
            pattern is invalid or has too many wildcards.
        """
        max_wildcards = 6
        hexdigits_uppercase = '0123456789ABCDEF'
        subpattern_list = []
        # wildcard alphabet defaults to hex characters
        wildcard_alphabet = hexdigits_uppercase

        def check_pattern(pattern, tag):
            """Return True if *pattern* has the structure required by *tag*."""
            stripped = pattern.replace('*', '')
            if tag in ('ADDR1', 'ADDR2', 'ADDR3', 'ADDR4'):
                if len(pattern) != 12:
                    self.template_logger.warning("Length of address " +
                                                 pattern + " invalid")
                    return False
                elif not stripped.isalnum():
                    self.template_logger.warning(
                        "Address " + pattern +
                        " contains invalid characters")
                    return False
            elif tag in ('FTYPE', 'FSUBTYPE'):
                if not stripped.isalpha():
                    self.template_logger.warning(
                        "Type or subtype " + pattern +
                        " contains invalid characters")
                    return False
            return True

        def expand_wildcards(subpattern, wildcard_index):
            """Recursively append every expansion of *subpattern*.

            Called again for each remaining wildcard; results are collected
            in the enclosing ``subpattern_list``.
            """
            wildcard_index = subpattern.find('*', wildcard_index)
            if wildcard_index < 0:
                subpattern_list.append(subpattern)
                return True
            else:
                is_last_wildcard = wildcard_index == subpattern.rfind('*')
                for wildcard_char in wildcard_alphabet:
                    # replace the first wildcard with the next character in
                    # the substitution alphabet
                    subsubpattern = subpattern.replace(
                        subpattern[wildcard_index], wildcard_char, 1)
                    if is_last_wildcard:
                        subpattern_list.append(subsubpattern)
                    else:
                        expand_wildcards(subsubpattern, wildcard_index)

        # Preprocess pattern
        base_pattern_normalised = str(base_pattern).upper().replace(' ', '')
        if not check_pattern(base_pattern_normalised, pattern_tag):
            self.template_logger.warning("Pattern " +
                                         base_pattern_normalised +
                                         " is invalid")
            return []

        if '*' not in base_pattern_normalised:
            subpattern_list.append(base_pattern_normalised)
            return subpattern_list

        if base_pattern_normalised.count('*') > max_wildcards:
            # max_wildcards is an int: it must be converted before
            # concatenation, otherwise this branch raises TypeError instead
            # of logging the warning.
            self.template_logger.warning("Not more than " +
                                         str(max_wildcards) +
                                         " wildcards allowed")
            return []

        # set alphabet to substitute wildcards with.
        if pattern_type == 'word':
            # the wildcard itself is not inserted, it would result in an
            # infinite loop
            wildcard_alphabet = (string.ascii_letters + string.digits + ' ' +
                                 string.punctuation.replace('*', ''))
        elif pattern_type == 'bit':
            wildcard_alphabet = '01'
        elif pattern_type == 'decimal':
            wildcard_alphabet = string.digits

        # Create all possible subpatterns for wildcards.
        expand_wildcards(base_pattern_normalised, 0)
        self.template_logger.debug(
            "Created " + str(len(subpattern_list)) +
            " subpatterns for wildcard base pattern " +
            base_pattern_normalised)
        return subpattern_list

    # Parse xml configuration file for signature definitions.
    try:
        self.parse_xml_config(self.signature_file, self.signature_schema)
    except FwFileNotAvailableError as err:
        self.template_logger.error("Couldn't access file " + err.file)
        return False
    except FwConfigNotValidError as err:
        self.template_logger.error("Signature file not valid; details: " +
                                   err.reason)
        return False

    # Create TST tree dictionary
    self.tst_tree_dict = {}  # clear tree dict in case something has changed
    self.template_logger.info("Creating TST trees")
    if self.signature_tree_dict and self.signature_info_dict:
        # Create a tree for each tag and add all patterns to it.
        for frame_value_tag, signature_list in self.signature_tree_dict.items():
            frame_value_tag_normalised = str(frame_value_tag).upper().replace(
                ' ', '')
            tst_tree = tst.TST()
            self.template_logger.debug("Created TST tree for tag " +
                                       frame_value_tag_normalised)
            for signature in signature_list:
                # only simple signatures have patterns
                if self.signature_info_dict[signature]['type'] == 'simple':
                    try:
                        tag_patterns = self.signature_info_dict[signature][
                            'patterns'][frame_value_tag]
                        # add all patterns for the current tag
                        for frame_value_pattern in tag_patterns['pattern_list']:
                            frame_value_pattern_type = tag_patterns[
                                'pattern_type']
                            for subpattern in process_pattern(
                                    frame_value_pattern,
                                    frame_value_tag_normalised,
                                    frame_value_pattern_type):
                                if not tst_tree[subpattern]:
                                    self.template_logger.debug(
                                        "Created new entry in tst tree for tag "
                                        + frame_value_tag_normalised +
                                        " with pattern " + subpattern)
                                    # create new pattern entry for tst tree
                                    tst_tree[subpattern] = []
                                # append signature to pattern entry of tree
                                tst_tree[subpattern].append(signature)
                                self.template_logger.debug(
                                    "Created new entry in tst tree for tag " +
                                    frame_value_tag_normalised +
                                    " with pattern " + subpattern)
                                self.template_logger.debug(
                                    "TST tree for subpattern " +
                                    str(subpattern) + ": " +
                                    str(tst_tree[subpattern]))
                    except KeyError:
                        self.template_logger.warning(
                            "Couldn't find tag " +
                            frame_value_tag_normalised +
                            " for signature " + signature)
            # Add finished tree to tree dict
            self.tst_tree_dict[frame_value_tag_normalised] = tst_tree
    self.template_logger.info("Created TST tree dictionary")
    self.template_logger.debug("TST tree dictionary: " +
                               str(self.tst_tree_dict))

    self.template_logger.info("Creating signature status dictionary")
    self.signature_status_dict = {}
    # Create status entry for each signature (simple and complex)
    for signature_name, signature_info in self.signature_info_dict.items():
        status = {
            'hits': 0,
            'count': 0,
            'logic': '',
            'silent': 0,
            'invert': 0,
            'metasignatures': []
        }
        self.signature_status_dict[signature_name] = status
        try:
            # count is total number of patterns or subsignatures, is set by
            # parser. Keys are copied in this order so that a missing key
            # leaves the later ones at their defaults.
            for key in ('count', 'logic', 'silent', 'invert',
                        'metasignatures'):
                status[key] = signature_info[key]
        except KeyError:
            self.template_logger.warning(
                "Couldn't create signature status entry for signature " +
                signature_name)
        else:
            self.template_logger.debug(
                "Created signatures status entry for signature " +
                signature_name)
    self.template_logger.debug("Signature status dictionary: " +
                               str(self.signature_status_dict))
    return True
def make_tst(l):
    """Return a new TST populated from *l* via ``bulk_add``."""
    tree = tst.TST()
    tree.bulk_add(l)
    return tree