Example #1
def extract_langlinks(sql, fo):
    total = 0
    category = 0
    instance = 0
    template = 0

    o = open(fo, 'w')

    with open(sql) as f:
        for line in f:
            if line.startswith('INSERT'):
                line = line[line.index('('):]
                line = line.strip('\n').strip(';').strip(')').strip('(')  # strip the leading '(' and trailing ')'
                for tri in line.split('),('):  # split on '),(' to get each row tuple
                    tri = tri.replace("'", '')  # drop the quotes around the values
                    _id, lan, link = tri.split(',', 2)  # the link may contain commas, so split at most twice
                    if lan == 'zh':
                        total += 1
                        if link.startswith('Category:'):
                            category += 1
                        if link.startswith('Template:'):
                            template += 1
                            print _id, HanziConv.toSimplified(link).encode('utf-8')
                        link = link.replace('_', ' ')
                        o.write('%s\t%s\n'%(_id,HanziConv.toSimplified(link).encode('utf-8')))
    
    instance = total - category - template
    print "Total:%d, Category:%d, Instance:%d, Template:%d"%(total, category, instance, template)
Example #2
 def replace(x):
     x = x.replace('"', "").replace("\r\n",
                                    " ").replace("\n",
                                                 " ").replace(",", ",")
     x = HanziConv.toSimplified(x)
     x = [a for a in cut(x) if a not in stop_words]
     x = " ".join(x)
     return x
Example #3
def clean(text):
    text = text.strip()
    # text = text.lower()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub("\s*", "", text)
    return text
Example #4
 def get_people_name(self):
     if self.get_main_content() != None:
         term_list = segment.seg(
             HanziConv.toSimplified(self.get_main_content()))
         for term in term_list:
             if str(term.nature) == NLP_Constant.people_name_pos:
                 return HanziConv.toTraditional(str(term.word))
     return None
Example #5
def to_S(k):
    txt = X.content[k].strip()
    txt = re.sub('\t|\r', '\n', txt)
    txt = txt.replace('\n\n', '\n')
    txt = re.sub('  |\u3000', ' ', txt)
    txt = HanziConv.toSimplified(txt)
    txt = txt.strip()
    return (X.shop_url[k], X.post_time[k],txt,int(X.score[k]),len(txt))
Example #6
def generate_qimai_addition_dataset(model_type="bert"):
    test_df = pickle_load(path_cache / "test_df.pkl")
    qimai_test_id = pickle_load(path_cache / f"{model_type}_qimai_test_id.pkl")
    appname2appdesc = pickle_load(path_cache / "appname2appdesc.pkl")
    apkname2appdesc = pickle_load(path_cache / "apkname2appdesc.pkl")

    test_df["appname"] = test_df["new_appname"]
    qimai_test_df = test_df.merge(apkname2appdesc)
    qimai_test_df = qimai_test_df[["appname", "app_desc"]]
    chusai_test_df = pickle_load(path_cache / "chusai_test_df.pkl")
    chusai_test_df = chusai_test_df.loc[~chusai_test_df["appname"].isna(),
                                        ["appname", "app_desc"]]

    appname2appdesc = pd.concat(
        [appname2appdesc, chusai_test_df, qimai_test_df], axis=0, sort=False)
    appname2appdesc["desc_len"] = appname2appdesc["app_desc"].str.replace(
        "[\x00-\xff”“•]", "").str.len()
    appname2appdesc["appname"] = appname2appdesc["appname"].str.lower(
    ).str.replace(" ", "")
    appname2appdesc["appname"] = [
        HanziConv.toSimplified(x) for x in appname2appdesc["appname"]
    ]
    appname2appdesc = appname2appdesc.sort_values("desc_len").drop_duplicates(
        "appname", keep="last")
    appname2appdesc = appname2appdesc.loc[appname2appdesc["desc_len"] >= 8]

    test_df_new = test_df.copy()
    test_df_new = test_df_new.loc[~test_df["id"].isin(qimai_test_id)]
    test_df_new["appname"] = test_df_new["appname"].str.lower().str.replace(
        " ", "")
    test_df_new["appname"] = [
        HanziConv.toSimplified(x) for x in test_df_new["appname"]
    ]
    test_df_new = test_df_new.merge(appname2appdesc)

    qimai_addition_test_id = test_df_new["id"].tolist()
    qimai_addition_test_dataset = generate_tensor_data(test_df_new["app_desc"],
                                                       model_type)
    qimai_addition_test_dataset = TensorDataset(*qimai_addition_test_dataset)

    pickle_save(qimai_addition_test_id,
                path_cache / "qimai_addition_test_id.pkl")
    pickle_save(
        qimai_addition_test_dataset,
        path_tensor_dataset / f"{model_type}_qimai_addition_test_dataset.pkl")
Example #7
 def __iter__(self):
     for content, (page_id, title) in self.wiki.get_texts():
         yield doc2vec.LabeledSentence(
             # 1. for each c in content,
             # 2. convert it to simplified Chinese and segment it with jieba,
             # 3. collecting the resulting tokens into the words list
             words=[w for c in content
                    for w in jieba.cut(HanziConv.toSimplified(c))],
             tags=[title])
Example #8
def normalize(text):
    toSim = HanziConv.toSimplified(text.replace('\n', ' '))
    t2 = unicodedata.normalize('NFKC', toSim)
    table = {
        ord(f): ord(t)
        for f, t in zip(u',。!?【】()%#@&1234567890', u',.!?[]()%#@&1234567890')
    }
    t3 = t2.translate(table)
    return t3
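The ord-to-ord dict passed to str.translate above can also be written with str.maketrans; a one-line Python 3 equivalent (a sketch, not part of the original function):

table = str.maketrans(u',。!?【】()%#@&1234567890',
                      u',.!?[]()%#@&1234567890')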
Example #9
File: data.py Project: zyunnn/nlp_toolkit
 def clean(self, line):
     line = re.sub(r'\[[\u4e00-\u9fa5a-z]{1,4}\]|\[aloha\]', '', line)
     line = re.sub(EMOJI_UNICODE, '', line)
     line = re.sub(self.html_texts, '', line)
     if re.search(r'[\u4e00-\u9fa5]+', line):
         line = HanziConv.toSimplified(line)
         return re.sub(' {2,}|\t', ' ', line).lower()
     else:
         return None
Example #10
 def len_tokenizer(self, input):
     # convert traditional characters to simplified
     text = HanziConv.toSimplified(input)
     # segment into words with jieba
     text = jieba.lcut(text)
     # remove stopwords
     if self._stopwordset:
         text = self.movestopwords(text)
     return len(text)
Example #11
def clear_text(x):
    x = BeautifulSoup(x, 'html.parser').text
    x = html.unescape(x)
    x = HanziConv.toSimplified(x)
    x = re.sub(r'\s+', '', x)  # remove all whitespace (spaces, tabs, newlines)
    x = re.sub(r'[\((【](.*?)[\))】]', '', x)  # drop bracketed content; the non-greedy ? keeps .* from swallowing the later bracket
    x = re.sub(r'([–-—=…]*)', '',x)
    x = x.strip()
    return x
Example #12
def terms2VecIDs(terms):
    ans = []
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term)) #Problem: Some terms are not pretrained, like '食记','咖哩','捷运'
        if ID is None:
            ans.append(0)
        else:
            ans.append(ID)
    return ans
Example #13
def main(args):
    with open(args.input, encoding='utf8') as f:
        lines = f.read().splitlines()
        if args.format == 'lines':
            lines.append('<song>')
        tot = len(lines)
    parsed_line = []
    thu1 = thulac.thulac(seg_only=True)
    with open(args.output, encoding='utf8', mode='w') as f:
        cnt = 0
        for line in lines:
            if args.format == 'lines':
                line = HanziConv.toSimplified(line)
            if cnt % 100 == 0:
                print('status: %d/%d' % (cnt, tot))
            cnt += 1
            if line == '<song>':
                if len(parsed_line) == 0:
                    continue
                n = len(parsed_line)
                # cap the total length of each output line at maxlen
                for i in range(n):
                    l = len(parsed_line[i])
                    if l > args.maxlen:
                        continue
                    ctrl_list = parsed_line[i]
                    for k in range(i + 1, n + 1):
                        if k == n or l + len(parsed_line[k]) + 1 > args.maxlen:
                            f.write(' '.join(ctrl_list) + '\n')
                            break
                        ctrl_list.append('<lbreak>')
                        ctrl_list += parsed_line[k]
                        l += len(parsed_line[k]) + 1
                parsed_line = []
                continue
            # segment the line with thulac or jieba
            if args.segment == 0:
                seg_list = jieba.lcut(line)
            else:
                seg_list = thu1.cut(line)
                seg_list = [t[0] for t in seg_list]
            seg_list2 = []
            for word in seg_list:
                seg_list2 += parse_segged_word(word)
            seg_list = seg_list2
            if args.segment == 0:
                seg_list2 = []
                for word in seg_list:
                    if word == '<num>':
                        seg_list2.append(word)
                    else:
                        seg_list2 += list(word)
                seg_list = seg_list2
            if len(seg_list) > 0:
                parsed_line.append(seg_list)
    print('Finished')
Example #14
def terms2Vec(terms):
    vec = np.zeros(len(embeddings[0]))
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term)) #Problem: Some terms are not pretrained, like '食记','咖哩','捷运'
        if ID is None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec
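terms2Vec relies on module-level word2id and embeddings; a toy setup with invented values shows the averaging, with row 0 acting as the fallback vector for unknown terms:

import numpy as np
from hanziconv import HanziConv

embeddings = np.array([[0.0, 0.0],   # row 0: fallback for out-of-vocabulary terms
                       [1.0, 2.0],
                       [3.0, 4.0]])
word2id = {u'数学': 1, u'音乐': 2}   # keys are the simplified forms

print(terms2Vec([u'數學', u'音樂']))  # mean of rows 1 and 2 -> [2. 3.]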
Example #15
def terms2Vec(terms):
    vec = np.zeros(len(embeddings[0]))
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term)) 
        if ID is None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec
Example #16
 def chinese_tokenizer(self, documents):
     for document in documents:
         # convert traditional characters to simplified
         text = HanziConv.toSimplified(document)
         # segment into words with jieba
         text = jieba.lcut(text)
         # remove stopwords
         if self._stopwordset:
             text = self.movestopwords(text)
         yield text
Example #17
File: preprocess.py Project: zjatc/ANNTOSA
def process_chinese_data(line, use_target, use_first_target):
    text = line['text']
    target = line['target']
    tar_idx = line['indices']
    label = line['label']
    words_text = text.split()

    tar_idx_list = []
    tokenized_text = []
    found_target = False
    words_text = [HanziConv.toSimplified(word).lower() for word in words_text]
    if use_target == 'token':
        for idx in tar_idx:
            words_text = [
                TARGET if str(i) in idx else word
                for i, word in enumerate(words_text)
            ]
        tokenized_text.extend(words_text)
        tar_idx_list = [1 if word == TARGET else 0 for word in tokenized_text]
    else:
        norm_target = [HanziConv.toSimplified(target).lower()]
        last_tar_end_idx = 0
        for idx in tar_idx:
            tar_start_idx = int(idx[0])
            if tar_start_idx != 0:
                norm_non_target_words = words_text[
                    last_tar_end_idx:tar_start_idx]
                tokenized_text.extend(norm_non_target_words)
                tar_idx_list.extend([0] * len(norm_non_target_words))
            tokenized_text.extend(norm_target)
            if use_first_target and found_target:
                tar_idx_list.extend([0] * len(norm_target))
            else:
                tar_idx_list.extend([1] * len(norm_target))
                found_target = True
            last_tar_end_idx = tar_start_idx + 1

        if last_tar_end_idx < len(words_text) - 1:
            norm_non_target_words = words_text[last_tar_end_idx:]
            tokenized_text.extend(norm_non_target_words)
            tar_idx_list.extend([0] * len(norm_non_target_words))

    return tokenized_text, target, tar_idx_list, label
Example #18
def chinese_tokenizer(documents):
    """
    Convert Chinese text into a sequence of words.
    Traditional characters are converted to simplified; English is lowercased.
    """

    for document in documents:
        text = HanziConv.toSimplified(document)
        text = text.lower()
        yield list(cut(text))
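A minimal driver for this generator, assuming the module-level cut is jieba.cut (the imports below are a guess at the original module's setup):

from jieba import cut
from hanziconv import HanziConv

docs = [u"自然語言處理很有趣"]
for tokens in chinese_tokenizer(docs):
    print(tokens)   # a list of simplified, lowercased tokens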
Example #19
def insert_example(c, definition_id, starting_example_id, example):
    # The example should be a list of Example objects, such that
    # the first item is the 'source', and all subsequent items are the
    # translations
    examples_inserted = 0

    trad = example[0].content
    simp = HanziConv.toSimplified(trad)
    jyut = example[0].pron
    pin = ""
    lang = example[0].lang

    example_id = database.insert_chinese_sentence(c, trad, simp, pin, jyut,
                                                  lang, starting_example_id)

    # Check if example insertion was successful
    if example_id == -1:
        if trad == "X" or trad == "x":
            # Ignore examples that are just 'x'
            return 0
        else:
            # If insertion failed, it's probably because the example already exists
            # Get its rowid, so we can link it to this definition
            example_id = database.get_chinese_sentence_id(
                c, trad, simp, pin, jyut, lang)
            if example_id == -1:  # Something went wrong if example_id is still -1
                return 0
    else:
        examples_inserted += 1

    database.insert_definition_chinese_sentence_link(c, definition_id,
                                                     example_id)

    for translation in example[1:]:
        sentence = translation.content
        lang = translation.lang

        # Check if translation already exists before trying to insert
        # Insert a translation only if the translation doesn't already exist in the database
        translation_id = database.get_nonchinese_sentence_id(c, sentence, lang)

        if translation_id == -1:
            translation_id = starting_example_id + examples_inserted
            database.insert_nonchinese_sentence(c, sentence, lang,
                                                translation_id)
            examples_inserted += 1

        # Then, link the translation to the example only if the link doesn't already exist
        link_id = database.get_sentence_link(c, example_id, translation_id)

        if link_id == -1:
            database.insert_sentence_link(c, example_id, translation_id, 1,
                                          True)

    return examples_inserted
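The leading comment describes the expected shape of example; a hypothetical stand-in for those Example objects, carrying only the fields this function reads (.content, .pron, .lang), could look like this (the real class lives elsewhere in the project):

from dataclasses import dataclass

@dataclass
class Example:       # hypothetical stand-in, fields inferred from the code above
    content: str     # sentence text
    pron: str        # pronunciation (stored as the Jyutping column for the source)
    lang: str        # language code

example = [
    Example("早晨!", "zou2 san4", "yue"),   # source sentence
    Example("Good morning!", "", "eng"),     # translation
]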
Example #20
def get_download_url(name, ep, keyword, translation_team, **dict):
    """
    Search download url in dmhy.org
    """
    root_url = 'https://share.dmhy.org'
    payload = {'keyword': keyword + ' ' + '{:0>2}'.format(ep)}
    user_agent = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,'
                      'like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    }
    print('DMHY scraper is searching for {} of {}'.format(ep, name))
    content = net.request_get_content(root_url + '/topics/list',
                                      retry=5,
                                      params=payload)
    soup = bs4.BeautifulSoup(content, 'lxml')
    trs = soup.find_all('tr')
    if len(trs) == 0:
        raise FileNotFoundError
    found_flag = False
    download_url = ''
    unified_name = name.lower()
    print('Unified name:{}'.format(unified_name))
    # Skip the table header
    for tr in trs[1:]:
        a = tr.select('td.title > a')[0]
        # Check the correctness of entry
        entry_desc = ''
        for string in a.strings:
            entry_desc += string
        # Eliminating spaces
        entry_desc = HanziConv.toSimplified(entry_desc.strip())
        try:
            print('Searching: {0}'.format(entry_desc))
        except:
            print('Experiencing encoding problem, but search is still going on.')
            print('Searching:', entry_desc.encode('utf-8'))
        unified_entry_desc = entry_desc.lower()
        if unified_name in unified_entry_desc:
            # Translation team check
            if (translation_team != []
                    and not any(trans_t.lower() in unified_entry_desc for trans_t in translation_team)):
                continue
            download_page_url = a['href']
            print('download_page link:{0}'.format(download_page_url))
            download_page_content = net.request_get_content(
                root_url + download_page_url,
                retry=5)
            soup1 = bs4.BeautifulSoup(download_page_content, 'lxml')
            url_list = soup1.find(id='tabs-1')
            p = url_list.find('p')
            download_url = p.find('a')['href']
            break
    if download_url == '':
        raise FileNotFoundError
    return "https:" + download_url
Example #21
def add_edge(dict_re_match_object):
    """ upload edge created from regular expression matched object.
    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')
    Keyword Arguments:
    re_match_object -- re object
    """
    # iterate nodes batch
    for index, value in dict_re_match_object.items():
        if value is not None:
            item = dict_re_match_object.get(index)
            edge_type = item.group(7)[1:-1]
            if edge_type == 'page':
                page_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in page_title:
                    end = page_title.split("\\n")
                    page_title = end[-1]
                page_title = page_title.replace(" ", "_")
                # page subtype is 0
                page_title = HanziConv.toSimplified(page_title)
                cat_title = HanziConv.toSimplified(cat_title)
                graph.add_edge(cat_title, page_title, subtype=0)
            if edge_type == 'subcat':
                subcat_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in subcat_title:
                    end = subcat_title.split("\\n")
                    subcat_title = end[-1]
                subcat_title = subcat_title.replace(" ", "_")
                if subcat_title == cat_title:
                    continue
                # subcategory subtype is 1
                subcat_title = HanziConv.toSimplified(subcat_title)
                cat_title = HanziConv.toSimplified(cat_title)
                graph.add_edge(cat_title, subcat_title, subtype=1)
                g.addEdge(cat_title, subcat_title)
Example #22
    def simplify_text(self):
        """
        Simplifies text input into simplified Chinese characters in preparation for segmentation. For reasons why
        simplification is done first, check documentation for method 'segment_text'.

        Returns:
            text_sim (str): A string containing simplified Chinese characters, obtained by simplifying text_input using hanziconverter.
        """
        text_filtered = self.filter_text()

        return hanzC.toSimplified(text_filtered)
Example #23
def expression_process(text):
    # print(text)
    strings = acp.expression_extract(text)
    res = []
    for i in strings:
        if i[1] == 'str':
            string = HanziConv.toSimplified(i[0]).lower()
            res += clean_stopwords(list(jieba.cut(string)), stopwords_new)
        else:
            res += [i[0]]
    return res
Example #24
File: wiki_word.py Project: gchange/ogeek
def process(inqueue, outqueue):
    while True:
        line = inqueue.get()
        if line is None:
            break
        words = [HanziConv.toSimplified(w) for w in line]
        words = [w2 for w1 in words for w2 in jieba.cut(w1, cut_all=False)if len(w2) > 1]
        words = [w.encode('utf-8') for w in words]
        text = b' '.join(words)
        outqueue.put(text)
    return
Example #25
def add_node(dict_re_match_object):
    """ upload node created from regular expression matched object.
    (6,'深圳证券交易所上市公司',13,0,0)
    Keyword Arguments:
    re_match_object -- re object
    """
    # iterate nodes batch
    for index, value in dict_re_match_object.items():
        if value is not None:
            item = dict_re_match_object.get(index)
            graph.add_node(HanziConv.toSimplified(item.group(2)[1:-1]))
Example #26
def get_titles(file_path):
    titles = set()
    with gzip.open(file_path, 'rb') as f:
        for line in f:
            title = line.decode('utf-8').rstrip('\n')
            for c in title:
                if u'\u4e00' <= c <= u'\u9fff':
                    titles.add(HanziConv.toSimplified(title))
                    break
            titles.add(title)
    return titles
Example #27
def mmtv_genre(soup):
    genres = ""
    if soup.find("span", {"class": "posts-inner-details-text-under"}):
        soup = soup.find("span", {"class": "posts-inner-details-text-under"})
        genres_mmtv = soup.find_all("span")
        genres = []
        for genre in genres_mmtv:
            genre = HanziConv.toSimplified(str(genre.text))
            genre = get_genre_type(genre)
            genres.append(genre)
    return genres
Example #28
    def __init__(self, url):
        self.url = url
        try:
            res = requests.get(url, timeout=3)
            self.page = BeautifulSoup(res.text, "html.parser")
            self.timeout = False
        except requests.exceptions.RequestException as e:
            errorLogger.exception(f"Error handling web page:{self.url}" +
                                  " -- request timeout")
            self.page = BeautifulSoup(open("emptyPage.html", encoding="utf8"),
                                      "html.parser")
            self.timeout = True
        self.isSong = url.startswith("https://mojim.com/cny")
        if self.isSong:
            try:
                text = self.page.find(id="fsZx1")
                texts = text.prettify().split("\n")
                # store processed and filtered lyric
                filtered = []
                for s in texts:
                    s = s.strip()
                    if (s == "<ol>"):
                        break
                    if not s.startswith("<") and s.find(":") == -1 and s.find(
                            "※") == -1 and not s.startswith("["):
                        filtered.append(s)

                self.artistName = HanziConv.toSimplified(filtered.pop(0))
                self.songName = HanziConv.toSimplified(filtered.pop(0))
                s = ""
                for t in filtered:
                    s = s + t + os.linesep
                # return a string representation of the lyric. each line is separated by
                # \n
                self.lyric = HanziConv.toSimplified(s)
            except:
                self.isSong = False
                errorLogger.exception(
                    f"Error handling web page:{self.url}\n\t{self.page.text}")

        self.relatedPages = self.relatedSongs()
Example #29
def get_info(app: str = "Spotify") -> Tuple[str, str, float, str, float]:
    """
    Get information about the track in play.

    Parameters
    ----------
    app : str, optional
        Name of the app, by default "Spotify"

    Returns
    -------
    Tuple[str, str, float, str, float]
        Title, artists, position, status, duration of the track

    Examples
    --------
    >>> title, artists, position, status, duration = get_info("Spotify")
    >>> status in [None, "playing", "paused"]
    True
    """

    template = f"""
    on run
        if application "{app}" is running then
            tell application "{app}"
                set currentInfo to {{name of current track, "|", artist of current track, "|", player position, "|", player state, "|", duration of current track}}
            end tell
        end if
        return currentInfo
    end run
    """

    code, res, error = osascript.run(template, background=False)
    title = artists = position = status = duration = None
    if code == 0:
        segments = res.split("|")
        title, artists, position, status, duration = map(
            lambda x: x.strip(' ,"'), segments)
        if all(x is not None
               for x in [title, artists, position, status, duration]):
            position = float(position)
            duration = float(duration)
            # values larger than 1200 are treated as milliseconds and scaled to seconds
            if duration > 1200:
                duration /= 1000
            title = HanziConv.toSimplified(title)
            title = re.sub(r"[[《<((【「{].*?[]】)」}>)》]", "", title)
            title = title.rsplit("-", 1)[0]
    else:
        logger.debug(error)

    return title, artists, position, status, duration
Example #30
    def normalize(self, ss):
        # convert traditional characters to simplified
        ss1 = HanziConv.toSimplified(ss)
        # lowercase
        ss2 = ss1.lower()
        # keep Chinese characters, letters and digits; everything else becomes a space
        ss3 = self.char_filter.sub(r' ', ss2)
        # collapse runs of whitespace into single spaces
        ss4 = ' '.join(ss3.strip().split())
        if not isinstance(ss4, str):
            ss4 = ss4.encode('utf8')
        return ss4
Example #31
File: sent_tools.py Project: syuoni/tools
def sent_extract(sent):
    sent = HC.toSimplified(sent)

    extracted = []
    for pt in [emoji_pt, link_pt, share_pt1, share_pt2]:
        ex = ';'.join(pt.findall(sent))
        if ex:
            sent = pt.sub('', sent)
        extracted.append(ex)

    sent = remove_pt.sub('', sent).strip()
    return sent, extracted
Example #32
File: pre.py Project: JohanyCheung/fsauor
def seg(x):
    x = HanziConv.toSimplified(x)
    x = re.sub('\x05|\x06|\x07|\.\.|\.\.\.', ' ', x)
    #w = posseg.cut(x.upper())
    #w = [word for word, flag in w if word not in stopwords and flag in keep_property]
    w = jieba.cut(x.upper())
    w = [
        word.strip() for word in w
        if word not in stopwords and len(word.strip()) > 0
    ]
    #w=[word.strip() for word in w if len(word)>0]
    return ' '.join(w)
Example #33
def chinese_tokenizer(documents):
    """
    Convert Chinese text into a sequence of words.
    """

    for document in documents:
        # convert traditional characters to simplified
        text = HanziConv.toSimplified(document)
        # lowercase English text
        text = text.lower()
        # segment into words
        yield list(cut(text))
Example #34
def get_char_list(query):
    query = HanziConv.toSimplified(query.strip())
    regEx = re.compile('[\\W]+')  # split the sentence on any run of non-word characters
    res = re.compile(r'([\u4e00-\u9fa5])')  # the CJK Unified Ideographs range
    sentences = regEx.split(query.lower())
    str_list = []
    for sentence in sentences:
        if res.split(sentence) == None:
            str_list.append(sentence)
        else:
            ret = res.split(sentence)
            str_list.extend(ret)
    return [w for w in str_list if len(w.strip()) > 0]
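A quick check of what this splitting produces on mixed input: CJK characters come out one per token, while Latin words and digit runs stay whole (assuming Python 3's Unicode-aware \W):

print(get_char_list(u"Hello 世界123"))   # -> ['hello', '世', '界', '123']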
Example #35
def remove_sepcail_segment(content, jieba_stop_words):
    content = HanziConv.toSimplified(content)
    seg_list = jieba.cut(content)
    seg_clean = []
    
    for word in seg_list:
        clean_word = getChinese(word).strip()
        if clean_word == '':
            continue
        seg_clean.append(clean_word)
    
    seg_clean = [word for word in seg_clean if not word in jieba_stop_words]
    return ','.join(seg_clean)
Example #36
    def segment2(self, sent):
        ssent = HanziConv.toSimplified(sent)

        res = self.segmenter.segment(ssent)

        arr = []

        start = 0
        for i in range(res.size()):
            length = len(res.get(i))
            # map the segmentation of the simplified text back onto the original string
            arr.append(sent[start:start + length])
            start += length

        return arr
Example #37
def convert_to_simplified(text):
    if u'歷' in text:
        text = text.replace(u'歷', u'历')
    return HanziConv.toSimplified(text)
Example #38
def traditional_to_simplified(ustring):
    return HanziConv.toSimplified(ustring)
Example #39
def simplify_or_none(text):
    if text is None:
        return None
    else:
        return HanziConv.toSimplified(text)
Example #40
#coding:utf-8
from hanziconv import HanziConv

stop_file = open("./other_data/stop_word.txt", 'r')
stop_word_array = []
for line in stop_file:
    temp = line.replace("\n", "")
    temp = HanziConv.toSimplified(temp)
    if temp not in stop_word_array:
        stop_word_array.append(temp)

stop_file1 = open("./generated_data/stop_word_final.txt", "w")
for i in stop_word_array:
    stop_file1.write(i.encode('utf8')+"\n")
Example #41
def chinese_tokenizer(s, lower=True):
    s = unicode(s)
    if lower:
        s = hanzi.toSimplified(s)
    return [t[0] for t in jieba_tokenize(s)]
Example #42
def simplified_eq(a, b):
    return len(a) == len(b) and \
           HanziConv.toSimplified(a[0]) == \
           HanziConv.toSimplified(b[0])
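For example, two equal-length strings whose first characters agree after simplification compare equal here:

from hanziconv import HanziConv

print(simplified_eq(u"數學", u"数学"))   # True: both are length 2 and 數 simplifies to 数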
Example #43
    # 草 ("grass"), "肏" is the actual character.  "艹" is not a real character
    # but it's used this way
    "操你", "草你", "日你",  # f**k you
    "操他", "草他", "日他",  # f**k his
    "操她", "草她", "日她",  # f**k her

    # Discrimination (racial slurs)
    "小日本",  # little Japanese
    "台湾狗",  # Taiwanese dogs
    "共产中国",  # communist Chinese
    "流氓国家",  # rogue country
    "人渣",  # human slag
    "我去",  # this is verbal and bad
    "鬼子"  # devil, usually a suffix
]
BAD = [HanziConv.toSimplified(word) for word in bad_init] + \
      [HanziConv.toTraditional(word) for word in bad_init]

INFORMAL = [
    # Hello
    "你好",  # nǐ hǎo; The standard "hello" greeting.
    "您好",  # nín hǎo; The same "hello" greeting as above
    "你怎么样",  # nǐ zěnmeyàng?; "What's up?", "How are you doing?"

    # Good afternoon
    "午安",  # wǔ'an; note: seldom used in the Mainland.
    "下午好",  # xìawǔ hǎo! Seldom used in the Republic of China

    # Good evening / Good night
    "晚安",  # wǎn'an; Literally "Peace at night", Good night.
    "晚上好",  # wǎnshang hǎo; Good evening!
Example #44
        m = re.search(ur"^(\[.+?\])(.+?):", s)
        if m:
            s = m.group(2) + m.group(1)
        else:
            m = re.search(ur"^\[.+?\](.*)", s)
            if m:
                s = m.group(1)
    return s

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument("input", action="store", nargs = 1)
    parser.add_argument("output", action="store", nargs = 1)
    parser.add_argument("--encoding", action="store", default="utf_8_sig", nargs=1)
    parser.add_argument("--traditional", action="store_true", default=False)
    args = parser.parse_args()

    buf = codecs.open(args.input[0], "rb", args.encoding).read()

    if args.traditional:
        buf = HanziConv.toTraditional(buf)
    else:
        buf = HanziConv.toSimplified(buf)

    lines = buf.split("\n")
    lines.sort(key = sort_func)
    codecs.open(args.output[0], "wb", args.encoding).writelines(lines)
Example #45
dic_postive ={}
dic_negative = {}
dic_term_orientation = {}
pos = 0.0
neg = 0.0
oth = 0.0
reader = csv.reader(open("./generated_data/training_file.csv", "rb"))
for row in reader:
    if row[1] == "1":
        pos += 1
    elif row[1] == "0":
        neg += 1
    elif row[1] == "2":
        oth += 1
    flag = row[1]
    temp = HanziConv.toSimplified(row[3])
    words = jieba.cut(temp, cut_all=False)
    word_is_counted = []
    for w in words:
        if w not in word_is_counted:
            word = w.encode('utf8')
            if (word not in punctuation and word not in stop_word_list) and only_nonascii(word) != "":
                if flag == '1':
                    if word not in dic_postive:
                        dic_postive[word] = 2
                    else:
                        dic_postive[word] += 1
                    if word not in dic_negative:
                        dic_negative[word] = 1
                elif flag == '0':
                    if word not in dic_negative:
Example #46
#     [dic_TW, dic_HK, dic_CN] = mdic()
#     str_TW = conv(a, dic_TW)
#     str_HK = conv(c, dic_HK)
#     str_CN = conv(b, dic_CN)
# print a, ' <-> ', str_TW, '\n', c, ' < -> ', str_HK, '\n', b, ' < -> ',
# str_CN


def check_contain_chinese(check_str):
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False

if __name__ == '__main__':
    fin = codecs.open("zhwiki-20151226-all-titles-in-ns0", "r", "utf-8")
    fout = codecs.open("zhwiki-titles-converted", "w", "utf-8")
    #[dic_TW, dic_HK, dic_CN] = mdic()
    # print(HanziConv.toSimplified("!_"))
    cnt = 0
    while True:
        cnt += 1
        if cnt % 10000 == 0:
            print(cnt)
        line = fin.readline()
        if line == "":
            break
        if check_contain_chinese(line):
            fout.write(HanziConv.toSimplified(line))