Example #1
# Assumed imports (not shown in the original snippet).
from jieba import Tokenizer


class jieba_api(object):
    def __init__(self):
        print("----------using jieba cut tool---------")

    def init_config(self, config):
        self.config = config
        self.dt = Tokenizer()

    def build_tool(self):
        # Register every line of the optional user dictionary as a high-frequency word.
        dict_path = self.config.get("user_dict", None)
        if dict_path is not None:
            import codecs
            with codecs.open(dict_path, "r", "utf-8") as frobj:
                lines = frobj.read().splitlines()
                for line in lines:
                    self.dt.add_word(line, 10000, "<baidu>")

    def cut(self, text):
        # Segment the text and return the tokens joined by single spaces.
        words = list(self.dt.cut(text))
        # print(words, " ".join([word for word in words if len(word) >= 1]))
        return " ".join([word for word in words if len(word) >= 1])
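
A minimal usage sketch for the class above (assumptions: jieba is installed, and the config dict shown here is illustrative; pass a real "user_dict" path to load extra words):

api = jieba_api()
api.init_config({})  # or {"user_dict": "path/to/user_dict.txt"} to load a custom dictionary
api.build_tool()
print(api.cut("自然语言处理"))  # prints the space-joined segmentation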
Example #2
# Assumed imports (not shown in the original snippet); BaseTokenizer is a
# base class from the surrounding project.
import os

from jieba import Tokenizer, setLogLevel


class JiebaTokenizer(BaseTokenizer):

    def __init__(self):
        file_path = os.path.abspath(__file__)
        file_dir = os.path.dirname(file_path)
        setLogLevel(0)  # quiet jieba's dictionary-loading log output

        # Point jieba at the bundled dict.txt.big.txt next to this module.
        self.tokenizer = Tokenizer()
        self.tokenizer.set_dictionary(
            os.path.join(
                file_dir,
                'dict.txt.big.txt'
            )
        )

        # Placeholder tokens that must survive segmentation as single words.
        specific_tokens = [
            '_url_',
            '_num_',
            '_phone_',
            '_time_'
        ]
        self.add_words(specific_tokens)

    def cut(self, sentence):
        splitted_tokens = self.tokenizer.lcut(sentence)
        # Drop stray "_" tokens left over when a placeholder token gets split.
        while '_' in splitted_tokens:
            splitted_tokens.remove('_')
        return splitted_tokens

    def add_word(self, word, freq=None, tag=None):
        self.tokenizer.add_word(word, freq, tag)
        self.tokenizer.suggest_freq(word, tune=True)

    def add_words(self, words, freq=None, tag=None):
        for word in words:
            self.add_word(word, freq, tag)
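
A hedged usage sketch for the class above, assuming dict.txt.big.txt is present next to the module and that BaseTokenizer requires no extra constructor arguments; the sentence is illustrative:

tokenizer = JiebaTokenizer()
print(tokenizer.cut("请在_time_之前访问_url_"))  # placeholder tokens should stay whole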
Example #3
'''.split('\n')

words = '''
亂丟
垃圾
柏油路面
被罰
低收入戶
為何
目前
'''.split('\n')

tokenizer = Tokenizer('./dict.txt.big.txt')

for word in words:
    tokenizer.add_word(word)


def read_answer(path):
    # Load the answer CSV and return its "地址" (address) column as a dict.
    df = pd.read_csv(path, index_col=0)
    return df['地址'].to_dict()


def read_train_data(path, x_col, y_col):
    # Drop missing rows and keep only label values that occur more than 10 times.
    df = pd.read_csv(path, index_col=0)
    df.dropna(inplace=True)
    col_freq = \
        df[y_col].value_counts().to_frame() \
        .query('{} > 10'.format(y_col))
    df = df.query('{} in @col_freq.index'.format(y_col))
    size = col_freq[y_col][0]
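
Assuming the imports omitted above this snippet are pandas as pd and jieba's Tokenizer, a small hedged check that the words registered above survive segmentation, using an illustrative sentence:

print(tokenizer.lcut('為何目前低收入戶亂丟垃圾被罰'))  # the added words should come back as single tokens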
Example #4
# Assumed imports (not shown in the original snippet); apriori is assumed to
# come from the efficient_apriori package, whose signature matches the call below.
import re
from collections import Counter, defaultdict
from itertools import chain, combinations, filterfalse

from efficient_apriori import apriori
from jieba import Tokenizer


class Movie_Tokenizer:
    SKIP_SPACE_RE = re.compile(r"^\s*$")
    BREAK_SENTENCE_RE = re.compile(r"[。;;.……!!]")
    STOPWORDS = set()

    def __init__(self):
        self.dt = Tokenizer()
        self.dt.initialize()  # preload the dictionary so later cuts do not stall the UI
        self.name_dict = {}  # character name -> set of aliases
        self.reversed_name_dict = {}  # alias -> character name

        self.text = None
        self._cut_result = []
        self.splited_result = []

    def set_text(self, text):
        text = text.strip()
        if self.text != text:
            self.text = text
            self._split_text()
            self._cache_expired()  # invalidate the cached cut result

    def _split_text(self):
        self.splited_result = list(
            self._filter_empty(self.BREAK_SENTENCE_RE.split(self.text)))
        return self.splited_result

    def _filter_empty(self, result):
        return list(
            filterfalse(lambda text: self.SKIP_SPACE_RE.match(text), result))

    def _generate_words_dict(self):
        # All character names plus all of their aliases, used as custom words for jieba.
        d = self.name_dict
        res = set(chain.from_iterable(d.values())).union(d.keys())
        return res

    def _cache_expired(self):
        self._cut_result = []

    def cut(self):
        if self._cut_result:
            return self._cut_result
        if not self.splited_result:
            self._split_text()
        words_dict = self._generate_words_dict()
        for word in words_dict:
            self.dt.add_word(word)
        res = map(self.dt.cut, self.splited_result)
        res = list(self._filter_empty(line_cut) for line_cut in res)
        self._cut_result = res
        return res

    def add_name(self, name):
        self.name_dict.setdefault(name, set())
        self._cache_expired()

    def add_alias(self, name, alias):
        self.name_dict[name].add(alias)
        self.reversed_name_dict[alias] = name
        self._cache_expired()

    def get_alias(self, name):
        return self.name_dict[name]

    def get_names(self):
        return set(self.name_dict.keys())

    def del_name(self, name):
        for alias in self.name_dict[name]:
            del self.reversed_name_dict[alias]
        del self.name_dict[name]
        self._cache_expired()

    def del_alias(self, name, alias):
        del self.reversed_name_dict[alias]
        self.name_dict[name].discard(alias)
        self._cache_expired()

    def initialize_tokenizer(self):
        self.dt = Tokenizer()
        self.dt.initialize()
        self._cache_expired()

    def names_by_sentence(self, drop_empty=False):
        cut_result = self.cut()
        words_dict = self._generate_words_dict()
        for line in cut_result:
            # Replace aliases with their canonical character names.
            word_set = set(
                self.reversed_name_dict.get(word) or word for word in line)
            # Filter out stopwords.
            word_set_without_stopwords = set(
                filter(lambda word: word not in self.STOPWORDS, word_set))
            # Intersect what is left with the character-name dictionary.
            name_set = word_set_without_stopwords & words_dict
            if drop_empty and not name_set:
                continue
            yield name_set

    def co_present(self):
        res = defaultdict(lambda: defaultdict(int))
        for name_set in self.names_by_sentence():
            for name1, name2 in combinations(name_set, 2):
                res[name1][name2] += 1
                res[name2][name1] += 1
        return res

    def word_freq(self):
        word_list = self.cut()
        words_without_stopwords = filterfalse(lambda x: x in self.STOPWORDS,
                                              chain.from_iterable(word_list))
        res = Counter(words_without_stopwords)
        return res

    def import_name_dict(self, name_dict):
        self.name_dict = name_dict
        for name in name_dict:
            for alias in name_dict[name]:
                self.reversed_name_dict.setdefault(alias, name)
        self._cache_expired()

    def import_stopwords(self, filename="edited_baidu_stopwords.txt"):
        with open(filename, encoding="utf8") as frobj:
            self.STOPWORDS = set(line.strip() for line in frobj)
        self._cache_expired()

    def apriori(self, min_support=0.01):
        # Mine frequent itemsets of character names that co-occur within a sentence.
        names_by_sentence = list(self.names_by_sentence(drop_empty=True))
        itemsets, rule = apriori(names_by_sentence, min_support=min_support)
        return itemsets
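
A hedged end-to-end sketch of how the class above might be driven; the names, alias, and text are made up, and import_stopwords / apriori are left out because they need an extra file and an extra package:

mt = Movie_Tokenizer()
mt.add_name('小明')
mt.add_name('小红')
mt.add_alias('小红', '红姐')
mt.set_text('小明遇到了红姐。小明和小红一起回家!')
print(dict(mt.co_present()))          # per-sentence name co-occurrence counts
print(mt.word_freq().most_common(5))  # most frequent tokens across all sentences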