Exemplo n.º 1
0
    def __init__(self):
        """Build the college, city, province and country lookup trees.

        Reads four text files whose paths come from names defined outside
        this method (``college_path``, ``city_path``, ``province_path`` and
        ``country_path``) and fills one ``tst.TST`` per category:

        * ``self.c_tst``        -- first field of each college line -> second field
        * ``self.city_tst``     -- second field of each city line -> the whole
          split line (a list of fields)
        * ``self.province_tst`` -- province name -> itself; names come from the
          first field of the city file plus every line of the province file
        * ``self.country_tst``  -- first field of each country line -> itself

        Every file is opened in a ``with`` block so the handle is released
        even when parsing fails.

        Raises:
            IOError: if any of the files cannot be opened.
            ValueError: if a college line does not split into exactly two
                space-separated fields (raised by ``dict()``).
        """
        self.c_tst = tst.TST()
        with open(college_path) as c_data:
            # Each line must be "<key> <value>" separated by a single space.
            c_map_data = dict(x.strip().split(" ") for x in c_data)
        for k, v in c_map_data.items():
            self.c_tst.put(k, v)

        self.city_tst = tst.TST()
        self.province_tst = tst.TST()
        with open(city_path) as city_data:
            city_data2 = [x.strip().split(" ") for x in city_data]
        # The first field of a city line is used as the province name.
        provinces = set(x[0] for x in city_data2)
        with open(province_path) as province_file:
            province_data = set(l.strip() for l in province_file)
        provinces = provinces.union(province_data)
        for x in provinces:
            self.province_tst.put(x, x)
        for v in city_data2:
            # Keyed by the second field; the stored value is the full field list.
            self.city_tst.put(v[1], v)

        self.country_tst = tst.TST()
        with open(country_path) as country_data:
            countries = set(l.strip().split(" ")[0] for l in country_data)
        for c in countries:
            self.country_tst.put(c, c)
Exemplo n.º 2
0
def split_entities(w, entity_words):
    """Scan ``w`` against a tree built from ``entity_words``.

    Every entity word is stored in a fresh ``tst.TST`` under itself, then
    ``w`` is scanned with a ``tst.TupleListAction``.

    Args:
        w: the text to scan.
        entity_words: iterable of words to register as entities.

    Returns:
        A list of 2-tuples holding the first two items of each scan result.
        NOTE(review): presumably these are the text fragment and its match
        length as reported by ``TupleListAction`` -- confirm against the
        ``tst`` documentation.
    """
    entity_tree = tst.TST()
    for entity in entity_words:
        entity_tree.put(entity, entity)

    pairs = []
    for hit in entity_tree.scan(w, tst.TupleListAction()):
        pairs.append((hit[0], hit[1]))
    return pairs
 def buildTSTDictionary(self,
                        wordList,
                        saveFile="..\\resources\\code\\arg-dictionary.tst"):
     """Store every word of ``wordList`` in a new TST and write it to disk.

     Args:
         wordList: iterable of words; each one is stored with the value 1.
         saveFile: path handed to the tree's ``write_to_file``.

     Returns:
         True when the tree was written, False when writing raised IOError.
     """
     dictionary = tst.TST()
     for entry in wordList:
         dictionary[entry] = 1
     try:
         dictionary.write_to_file(saveFile)
     except IOError:
         return False
     else:
         return True
Exemplo n.º 4
0
def buildTST():
    """Return a new ``tst.TST`` filled via ``bulk_add`` with ``corpus``.

    ``corpus`` is not defined in this function; it is looked up in the
    enclosing (module) scope when the function runs.
    """
    tree = tst.TST()
    tree.bulk_add(corpus)
    return tree
 def loadTSTDictionary(self, fileName):
     """Load a TST from ``fileName`` into ``self.dict``.

     ``self.dict`` is bound to a fresh ``tst.TST`` before reading; if
     ``read_from_file`` raises IOError it is reset to None instead.

     Args:
         fileName: path handed to the tree's ``read_from_file``.
     """
     loaded = tst.TST()
     # Bind first so self.dict already refers to the tree while it is read.
     self.dict = loaded
     try:
         loaded.read_from_file(fileName)
     except IOError:
         self.dict = None
Exemplo n.º 6
0
def MultiReplaceWithDict(replace_dict):
    """Build a TST from ``replace_dict`` and wrap it with MultiReplaceWithTST.

    Args:
        replace_dict: mapping whose keys and values are copied into a new
            ``tst.TST`` one entry at a time.

    Returns:
        Whatever ``MultiReplaceWithTST`` returns for the populated tree.
    """
    t = tst.TST()
    # ``items()`` exists on both Python 2 and 3; ``iteritems()`` is Python 2 only.
    for k, v in replace_dict.items():
        t[k] = v
    return MultiReplaceWithTST(t)
Exemplo n.º 7
0
                unicodedata.name(c)[0:8] == "KATAKANA" or
                unicodedata.name(c)[0:3] == "CJK" or
                unicodedata.name(c)[0:5] == "DIGIT" or
                unicodedata.name(c)[0:5] == "LATIN"):
            return False
    return True

def createKeywordTree(filename):
    """Build and return a keyword tree from the headwords listed in a file.

    The file is read as UTF-8, one headword per line. Trailing whitespace is
    stripped from each line and every word accepted by ``isValid`` is stored
    in the tree with the value True.

    Args:
        filename: path of the headword file.

    Returns:
        The populated ``tst.TST``, or None when the file cannot be opened
        (the IOError is printed in that case).
    """
    try:
        fp = codecs.open(filename, "r", "utf-8")
    except IOError as e:
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print(e)
        return None

    tree = tst.TST()
    # ``with`` closes the file even if isValid() or the insertion raises.
    with fp:
        for line in fp:
            word = line.rstrip()
            if isValid(word):
                tree[word] = True  # register the word in the TST
    return tree

def analysis(text, tree):
    """textからWikipedia見出し語を抽出する"""
    keywords = []
    result = tree.scan(text, tst.TupleListAction())
    for t in result:
        if t[2] == True:
            try:
                word = unicode(t[0], "utf-8")
Exemplo n.º 8
0
    def template_setup(self):
        """template_setup()

        Sets up TST trees for matching.
        A separate tree is built for each distinct frame value
        (e. g. one tree for rtheader matches, one for address matches).

        Steps:
        1. Parse the signature file via ``self.parse_xml_config``.
        2. Reset ``self.tst_tree_dict``.
        3. If both ``self.signature_tree_dict`` and
           ``self.signature_info_dict`` are non-empty, build one tree per
           normalised frame value tag (each expanded pattern maps to the list
           of signatures using it) and rebuild ``self.signature_status_dict``
           with one status entry per signature.

        Returns False if the signature file can't be accessed or is not
        valid, True otherwise (also when there were no signatures to process).

        """
        def process_pattern(base_pattern, pattern_tag, pattern_type):
            """process_pattern()

            Preprocesses patterns, e. g. normalises pattern and expands wildcards.

            Returns the list of concrete subpatterns for base_pattern, or an
            empty list if the pattern fails check_pattern() or contains more
            than max_wildcards wildcards.

            """

            max_wildcards = 6
            hexdigits_uppercase = '0123456789ABCDEF'
            subpattern_list = []
            wildcard_alphabet = hexdigits_uppercase  # wildcard alphabet defaults to hex characters

            def check_pattern(pattern, tag):
                """check_pattern()

                Checks if pattern has correct structure.
                Address tags need 12 alphanumeric characters (wildcards
                aside), type/subtype tags need alphabetic characters
                (wildcards aside); other tags are accepted unchecked.

                """

                if tag in ('ADDR1', 'ADDR2', 'ADDR3', 'ADDR4'):
                    if len(pattern) != 12:
                        self.template_logger.warning("Length of address " +
                                                     pattern + " invalid")
                        return False
                    elif not pattern.replace('*', '').isalnum():
                        self.template_logger.warning(
                            "Address " + pattern +
                            " contains invalid characters")
                        return False
                elif tag in ('FTYPE', 'FSUBTYPE'):
                    if not pattern.replace('*', '').isalpha():
                        self.template_logger.warning(
                            "Type or subtype " + pattern +
                            " contains invalid characters")
                        return False
                return True

            def expand_wildcards(subpattern, wildcard_index):
                """expand_wildcards()

                Is called recursively if a wildcard is encountered.
                Creates a list of all possible subpatterns
                (appended to subpattern_list of the enclosing function).

                """

                wildcard_index = subpattern.find('*', wildcard_index)
                if wildcard_index < 0:
                    subpattern_list.append(subpattern)
                    return True
                else:
                    for wildcard_char in wildcard_alphabet:
                        subsubpattern = subpattern.replace(
                            subpattern[wildcard_index], wildcard_char, 1
                        )  # replace the first wildcard with the next character in substitution alphabet
                        if wildcard_index == subpattern.rfind('*'):
                            # That was the last wildcard: the pattern is complete.
                            subpattern_list.append(subsubpattern)
                        else:
                            expand_wildcards(subsubpattern, wildcard_index)

            # Preprocess pattern
            base_pattern_normalised = str(base_pattern).upper().replace(
                ' ', '')
            if check_pattern(base_pattern_normalised, pattern_tag):
                if '*' in base_pattern_normalised:
                    if base_pattern_normalised.count('*') <= max_wildcards:
                        # set alphabet to substitute wildcards with.
                        # NOTE(review): the number of subpatterns grows as
                        # len(alphabet) ** wildcard count -- large for 'word'.
                        if pattern_type == 'word':
                            wildcard_alphabet = string.ascii_letters + string.digits + ' ' + string.punctuation.replace(
                                '*', ''
                            )  # wildcard is not inserted, would result in infinite loop
                        elif pattern_type == 'bit':
                            wildcard_alphabet = '01'
                        elif pattern_type == 'decimal':
                            wildcard_alphabet = string.digits
                        # Create all possible subpatterns for wildcards.
                        expand_wildcards(base_pattern_normalised, 0)
                        self.template_logger.debug(
                            "Created " + str(len(subpattern_list)) +
                            " subpatterns for wildcard base pattern " +
                            base_pattern_normalised)
                    else:
                        # max_wildcards is an int; convert it before
                        # concatenating, otherwise this raises TypeError.
                        self.template_logger.warning("Not more than " +
                                                     str(max_wildcards) +
                                                     " wildcards allowed")
                        return []
                else:
                    subpattern_list.append(base_pattern_normalised)
            else:
                self.template_logger.warning("Pattern " +
                                             base_pattern_normalised +
                                             " is invalid")
                return []
            return subpattern_list

        # Parse xml configuration file for signature definitions.
        try:
            self.parse_xml_config(self.signature_file, self.signature_schema)
        except FwFileNotAvailableError as err:
            self.template_logger.error("Couldn't access file " + err.file)
            return False
        except FwConfigNotValidError as err:
            self.template_logger.error("Signature file not valid; details: " +
                                       err.reason)
            return False
        # Create TST tree dictionary
        self.tst_tree_dict = {
        }  # clear tree dict in case something has changed
        self.template_logger.info("Creating TST trees")
        if self.signature_tree_dict and self.signature_info_dict:
            for frame_value_tag, signature_list in self.signature_tree_dict.items(
            ):
                # Create a tree for each tag and add all patterns to it.
                frame_value_tag_normalised = str(
                    frame_value_tag).upper().replace(' ', '')
                tst_tree = tst.TST()
                self.template_logger.debug("Created TST tree for tag " +
                                           frame_value_tag_normalised)
                for signature in signature_list:
                    if self.signature_info_dict[signature][
                            'type'] == 'simple':  # only simple signatures have patterns
                        try:
                            for frame_value_pattern in self.signature_info_dict[
                                    signature]['patterns'][frame_value_tag][
                                        'pattern_list']:  # add all patterns for the current tag
                                frame_value_pattern_type = self.signature_info_dict[
                                    signature]['patterns'][frame_value_tag][
                                        'pattern_type']
                                for subpattern in process_pattern(
                                        frame_value_pattern,
                                        frame_value_tag_normalised,
                                        frame_value_pattern_type):
                                    if not tst_tree[subpattern]:
                                        self.template_logger.debug(
                                            "Created new entry in tst tree for tag "
                                            + frame_value_tag_normalised +
                                            " with pattern " + subpattern)
                                        tst_tree[subpattern] = [
                                        ]  # create new pattern entry for tst tree
                                    tst_tree[subpattern].append(
                                        signature
                                    )  # append signature to pattern entry of tree
                                    # NOTE(review): this message repeats the
                                    # "new entry" text even when the entry
                                    # already existed -- confirm the intent.
                                    self.template_logger.debug(
                                        "Created new entry in tst tree for tag "
                                        + frame_value_tag_normalised +
                                        " with pattern " + subpattern)
                                    self.template_logger.debug(
                                        "TST tree for subpattern " +
                                        str(subpattern) + ": " +
                                        str(tst_tree[subpattern]))
                        except KeyError:
                            # The signature has no pattern data for this tag.
                            self.template_logger.warning(
                                "Couldn't find tag " +
                                frame_value_tag_normalised +
                                " for signature " + signature)
                # Add finished tree to tree dict
                self.tst_tree_dict[frame_value_tag_normalised] = tst_tree
            self.template_logger.info("Created TST tree dictionary")
            self.template_logger.debug("TST tree dictionary: " +
                                       str(self.tst_tree_dict))
            self.template_logger.info("Creating signature status dictionary")
            self.signature_status_dict = {}
            for signature_name, signature_info in self.signature_info_dict.items(
            ):
                # Create status entry for each signature (simple and complex)
                self.signature_status_dict[signature_name] = {
                    'hits': 0,
                    'count': 0,
                    'logic': '',
                    'silent': 0,
                    'invert': 0,
                    'metasignatures': []
                }
                try:
                    # Fields are copied one by one; on a missing key the
                    # remaining fields keep the defaults set above.
                    self.signature_status_dict[signature_name][
                        'count'] = signature_info[
                            'count']  # count is total number of patterns or subsignatures, is set by parser
                    self.signature_status_dict[signature_name][
                        'logic'] = signature_info['logic']
                    self.signature_status_dict[signature_name][
                        'silent'] = signature_info['silent']
                    self.signature_status_dict[signature_name][
                        'invert'] = signature_info['invert']
                    self.signature_status_dict[signature_name][
                        'metasignatures'] = signature_info['metasignatures']
                except KeyError:
                    self.template_logger.warning(
                        "Couldn't create signature status entry for signature "
                        + signature_name)
                else:
                    self.template_logger.debug(
                        "Created signatures status entry for signature " +
                        signature_name)
            self.template_logger.debug("Signature status dictionary: " +
                                       str(self.signature_status_dict))
        return True
Exemplo n.º 9
0
def make_tst(l):
    """Return a new ``tst.TST`` populated by passing ``l`` to ``bulk_add``."""
    tree = tst.TST()
    tree.bulk_add(l)
    return tree