def extract_langlinks(sql, fo):
    total = 0
    category = 0
    instance = 0
    template = 0
    o = open(fo, 'w')
    with open(sql) as f:
        for line in f:
            if line.startswith('INSERT'):
                line = line[line.index('('):]
                line = line.strip('\n').strip(';').strip(')').strip('(')  # strip the leading/trailing ( and )
                for tri in line.split('),('):  # split on '),(' to get each item
                    tri = tri.replace("'", '')
                    _id, lan, link = tri.split(',', 2)  # the link may contain commas, so split at most twice
                    if lan == 'zh':
                        total += 1
                        if link.startswith('Category:'):
                            category += 1
                        if link.startswith('Template:'):
                            template += 1
                        print _id, HanziConv.toSimplified(link).encode('utf-8')
                        link = link.replace('_', ' ')
                        o.write('%s\t%s\n' % (_id, HanziConv.toSimplified(link).encode('utf-8')))
    instance = total - category - template
    print "Total:%d, Category:%d, Instance:%d, Template:%d" % (total, category, instance, template)

def replace(x):
    x = x.replace('"', "").replace("\r\n", " ").replace("\n", " ").replace("，", ",")
    x = HanziConv.toSimplified(x)
    x = [a for a in cut(x) if a not in stop_words]
    x = " ".join(x)
    return x

def clean(text):
    text = text.strip()
    # text = text.lower()
    text = HanziConv.toSimplified(text)
    text = full2half(text)
    text = re.sub("\\#.*?#|\\|.*?\\||\\[.*?]", "", text)
    # text = re.sub("\s*", "", text)
    return text

def get_people_name(self):
    if self.get_main_content() != None:
        term_list = segment.seg(
            HanziConv.toSimplified(self.get_main_content()))
        for term in term_list:
            if str(term.nature) == NLP_Constant.people_name_pos:
                return HanziConv.toTraditional(str(term.word))
    return None

def to_S(k):
    txt = X.content[k].strip()
    txt = re.sub('\t|\r', '\n', txt)
    txt = txt.replace('\n\n', '\n')
    txt = re.sub(' |\u3000', ' ', txt)
    txt = HanziConv.toSimplified(txt)
    txt = txt.strip()
    return (X.shop_url[k], X.post_time[k], txt, int(X.score[k]), len(txt))

def generate_qimai_addition_dataset(model_type="bert"):
    test_df = pickle_load(path_cache / "test_df.pkl")
    qimai_test_id = pickle_load(path_cache / f"{model_type}_qimai_test_id.pkl")
    appname2appdesc = pickle_load(path_cache / "appname2appdesc.pkl")
    apkname2appdesc = pickle_load(path_cache / "apkname2appdesc.pkl")
    test_df["appname"] = test_df["new_appname"]
    qimai_test_df = test_df.merge(apkname2appdesc)
    qimai_test_df = qimai_test_df[["appname", "app_desc"]]
    chusai_test_df = pickle_load(path_cache / "chusai_test_df.pkl")
    chusai_test_df = chusai_test_df.loc[~chusai_test_df["appname"].isna(),
                                        ["appname", "app_desc"]]
    appname2appdesc = pd.concat(
        [appname2appdesc, chusai_test_df, qimai_test_df], axis=0, sort=False)
    appname2appdesc["desc_len"] = appname2appdesc["app_desc"].str.replace(
        "[\x00-\xff”“•]", "").str.len()
    appname2appdesc["appname"] = appname2appdesc["appname"].str.lower(
    ).str.replace(" ", "")
    appname2appdesc["appname"] = [
        HanziConv.toSimplified(x) for x in appname2appdesc["appname"]
    ]
    appname2appdesc = appname2appdesc.sort_values("desc_len").drop_duplicates(
        "appname", keep="last")
    appname2appdesc = appname2appdesc.loc[appname2appdesc["desc_len"] >= 8]
    test_df_new = test_df.copy()
    test_df_new = test_df_new.loc[~test_df["id"].isin(qimai_test_id)]
    test_df_new["appname"] = test_df_new["appname"].str.lower().str.replace(
        " ", "")
    test_df_new["appname"] = [
        HanziConv.toSimplified(x) for x in test_df_new["appname"]
    ]
    test_df_new = test_df_new.merge(appname2appdesc)
    qimai_addition_test_id = test_df_new["id"].tolist()
    qimai_addition_test_dataset = generate_tensor_data(test_df_new["app_desc"],
                                                       model_type)
    qimai_addition_test_dataset = TensorDataset(*qimai_addition_test_dataset)
    pickle_save(qimai_addition_test_id,
                path_cache / "qimai_addition_test_id.pkl")
    pickle_save(
        qimai_addition_test_dataset,
        path_tensor_dataset / f"{model_type}_qimai_addition_test_dataset.pkl")

def __iter__(self):
    for content, (page_id, title) in self.wiki.get_texts():
        yield doc2vec.LabeledSentence(
            # 1. for each chunk c in content,
            # 2. convert it to Simplified Chinese and segment with jieba,
            # 3. collect the resulting words into the words list
            words=[w for c in content
                   for w in jieba.cut(HanziConv.toSimplified(c))],
            tags=[title])

def normalize(text):
    toSim = HanziConv.toSimplified(text.replace('\n', ' '))
    t2 = unicodedata.normalize('NFKC', toSim)
    table = {
        ord(f): ord(t)
        for f, t in zip(u',。!?【】()%#@&1234567890',
                        u',.!?[]()%#@&1234567890')
    }
    t3 = t2.translate(table)
    return t3

def clean(self, line):
    line = re.sub(r'\[[\u4e00-\u9fa5a-z]{1,4}\]|\[aloha\]', '', line)
    line = re.sub(EMOJI_UNICODE, '', line)
    line = re.sub(self.html_texts, '', line)
    if re.search(r'[\u4300-\u9fa5]+', line):
        line = HanziConv.toSimplified(line)
        return re.sub(' {2,}|\t', ' ', line).lower()
    else:
        return None

def len_tokenizer(self, input):
    # convert Traditional Chinese to Simplified
    text = HanziConv.toSimplified(input)
    # segment into words
    text = jieba.lcut(text)
    # remove stop words
    if self._stopwordset:
        text = self.movestopwords(text)
    return len(text)

def clear_text(x):
    x = BeautifulSoup(x, 'html.parser').text
    x = html.unescape(x)
    x = HanziConv.toSimplified(x)
    x = re.sub(r'\s+', '', x)  # remove all whitespace (spaces, tabs, newlines)
    x = re.sub(r'[\((【](.*?)[\))】]', '', x)  # drop bracketed content; .*? is non-greedy so it stops at the first closing bracket
    x = re.sub(r'([–-—=…]*)', '', x)
    x = x.strip()
    return x

def terms2VecIDs(terms):
    ans = []
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term))
        # Problem: some terms are not pretrained, like '食记', '咖哩', '捷运'
        if ID is None:
            ans.append(0)
        else:
            ans.append(ID)
    return ans

def main(args):
    with open(args.input, encoding='utf8') as f:
        lines = f.read().splitlines()
    if args.format == 'lines':
        lines.append('<song>')
    tot = len(lines)
    parsed_line = []
    thu1 = thulac.thulac(seg_only=True)
    with open(args.output, encoding='utf8', mode='w') as f:
        cnt = 0
        for line in lines:
            if args.format == 'lines':
                line = HanziConv.toSimplified(line)
            if cnt % 100 == 0:
                print('status: %d/%d' % (cnt, tot))
            cnt += 1
            if line == '<song>':
                if len(parsed_line) == 0:
                    continue
                n = len(parsed_line)
                # cap the total length of each output line at maxlen
                for i in range(n):
                    l = len(parsed_line[i])
                    if l > args.maxlen:
                        continue
                    ctrl_list = parsed_line[i]
                    for k in range(i + 1, n + 1):
                        if k == n or l + len(parsed_line[k]) + 1 > args.maxlen:
                            f.write(' '.join(ctrl_list) + '\n')
                            break
                        ctrl_list.append('<lbreak>')
                        ctrl_list += parsed_line[k]
                        l += len(parsed_line[k]) + 1
                parsed_line = []
                continue
            # segment with thulac or jieba
            if args.segment == 0:
                seg_list = jieba.lcut(line)
            else:
                seg_list = thu1.cut(line)
                seg_list = [t[0] for t in seg_list]
            seg_list2 = []
            for word in seg_list:
                seg_list2 += parse_segged_word(word)
            seg_list = seg_list2
            if args.segment == 0:
                seg_list2 = []
                for word in seg_list:
                    if word == '<num>':
                        seg_list2.append(word)
                    else:
                        seg_list2 += list(word)
                seg_list = seg_list2
            if len(seg_list) > 0:
                parsed_line.append(seg_list)
    print('Finished')

def terms2Vec(terms):
    vec = np.zeros(len(embeddings[0]))
    for term in terms:
        ID = word2id.get(HanziConv.toSimplified(term))
        # Problem: some terms are not pretrained, like '食记', '咖哩', '捷运'
        if ID is None:
            vec += embeddings[0]
        else:
            vec += embeddings[ID]
    vec /= len(terms)
    return vec

def chinese_tokenizer(self, documents):
    for document in documents:
        # convert Traditional Chinese to Simplified
        text = HanziConv.toSimplified(document)
        # segment into words
        text = jieba.lcut(text)
        # remove stop words
        if self._stopwordset:
            text = self.movestopwords(text)
        yield text

def process_chinese_data(line, use_target, use_first_target):
    text = line['text']
    target = line['target']
    tar_idx = line['indices']
    label = line['label']
    words_text = text.split()
    tar_idx_list = []
    tokenized_text = []
    found_target = False
    words_text = [HanziConv.toSimplified(word).lower() for word in words_text]
    if use_target == 'token':
        for idx in tar_idx:
            words_text = [
                TARGET if str(i) in idx else word
                for i, word in enumerate(words_text)
            ]
        tokenized_text.extend(words_text)
        tar_idx_list = [1 if word == TARGET else 0 for word in tokenized_text]
    else:
        norm_target = [HanziConv.toSimplified(target).lower()]
        last_tar_end_idx = 0
        for idx in tar_idx:
            tar_start_idx = int(idx[0])
            if tar_start_idx != 0:
                norm_non_target_words = words_text[last_tar_end_idx:tar_start_idx]
                tokenized_text.extend(norm_non_target_words)
                tar_idx_list.extend([0] * len(norm_non_target_words))
            tokenized_text.extend(norm_target)
            if use_first_target and found_target:
                tar_idx_list.extend([0] * len(norm_target))
            else:
                tar_idx_list.extend([1] * len(norm_target))
                found_target = True
            last_tar_end_idx = tar_start_idx + 1
        if last_tar_end_idx < len(words_text) - 1:
            norm_non_target_words = words_text[last_tar_end_idx:]
            tokenized_text.extend(norm_non_target_words)
            tar_idx_list.extend([0] * len(norm_non_target_words))
    return tokenized_text, target, tar_idx_list, label

def chinese_tokenizer(documents):
    """
    Convert Chinese text into a sequence of words.
    Traditional characters are converted to Simplified; English is lowercased.
    """
    for document in documents:
        text = HanziConv.toSimplified(document)
        text = text.lower()
        yield list(cut(text))

def insert_example(c, definition_id, starting_example_id, example):
    # The example should be a list of Example objects, such that
    # the first item is the 'source', and all subsequent items are the
    # translations
    examples_inserted = 0

    trad = example[0].content
    simp = HanziConv.toSimplified(trad)
    jyut = example[0].pron
    pin = ""
    lang = example[0].lang

    example_id = database.insert_chinese_sentence(c, trad, simp, pin, jyut,
                                                  lang, starting_example_id)

    # Check if example insertion was successful
    if example_id == -1:
        if trad == "X" or trad == "x":
            # Ignore examples that are just 'x'
            return 0
        else:
            # If insertion failed, it's probably because the example already exists
            # Get its rowid, so we can link it to this definition
            example_id = database.get_chinese_sentence_id(
                c, trad, simp, pin, jyut, lang)
            if example_id == -1:
                # Something went wrong if example_id is still -1
                return 0
    else:
        examples_inserted += 1

    database.insert_definition_chinese_sentence_link(c, definition_id,
                                                     example_id)

    for translation in example[1:]:
        sentence = translation.content
        lang = translation.lang

        # Check if translation already exists before trying to insert
        # Insert a translation only if the translation doesn't already exist in the database
        translation_id = database.get_nonchinese_sentence_id(c, sentence, lang)
        if translation_id == -1:
            translation_id = starting_example_id + examples_inserted
            database.insert_nonchinese_sentence(c, sentence, lang,
                                                translation_id)
            examples_inserted += 1

        # Then, link the translation to the example only if the link doesn't already exist
        link_id = database.get_sentence_link(c, example_id, translation_id)
        if link_id == -1:
            database.insert_sentence_link(c, example_id, translation_id, 1, True)

    return examples_inserted

def get_download_url(name, ep, keyword, translation_team, **dict):
    """ Search download url in dmhy.org """
    root_url = 'https://share.dmhy.org'
    payload = {'keyword': keyword + ' ' + '{:0>2}'.format(ep)}
    user_agent = {
        'User-agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,'
        'like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    }
    print('DMHY scraper is searching for {} of {}'.format(ep, name))
    content = net.request_get_content(root_url + '/topics/list',
                                      retry=5, params=payload)
    soup = bs4.BeautifulSoup(content, 'lxml')
    trs = soup.find_all('tr')
    if len(trs) == 0:
        raise FileNotFoundError
    found_flag = False
    download_url = ''
    unified_name = name.lower()
    print('Unified name:{}'.format(unified_name))
    # Skip the table header
    for tr in trs[1:]:
        a = tr.select('td.title > a')[0]
        # Check the correctness of entry
        entry_desc = ''
        for string in a.strings:
            entry_desc += string
        # Eliminating spaces
        entry_desc = HanziConv.toSimplified(entry_desc.strip())
        try:
            print('Searching: {0}'.format(entry_desc))
        except:
            print('Experiencing encoding problem, but search is still going on.')
            print('Searching:', entry_desc.encode('utf-8'))
        unified_entry_desc = entry_desc.lower()
        if unified_name in unified_entry_desc:
            # Translation team check
            if (translation_team != [] and
                    not any(trans_t.lower() in unified_entry_desc
                            for trans_t in translation_team)):
                continue
            download_page_url = a['href']
            print('download_page link:{0}'.format(download_page_url))
            download_page_content = net.request_get_content(
                root_url + download_page_url, retry=5)
            soup1 = bs4.BeautifulSoup(download_page_content, 'lxml')
            url_list = soup1.find(id='tabs-1')
            p = url_list.find('p')
            download_url = p.find('a')['href']
            break
    if download_url == '':
        raise FileNotFoundError
    return "https:" + download_url

def add_edge(dict_re_match_object):
    """ upload edge created from regular expression matched object.
    (9,'En-3_使用者','MOUNTAIN','2015-09-02 13:44:06','','uppercase','page')

    Keyword Arguments:
    re_match_object -- re object
    """
    # iterate over the batch of matched nodes
    for index, value in dict_re_match_object.items():
        if value is not None:
            item = dict_re_match_object.get(index)
            edge_type = item.group(7)[1:-1]
            if edge_type == 'page':
                page_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in page_title:
                    end = page_title.split("\\n")
                    page_title = end[-1]
                page_title = page_title.replace(" ", "_")
                # page subtype is 0
                page_title = HanziConv.toSimplified(page_title)
                cat_title = HanziConv.toSimplified(cat_title)
                graph.add_edge(cat_title, page_title, subtype=0)
            if edge_type == 'subcat':
                subcat_title = item.group(3)[1:-1]
                cat_title = item.group(2)[1:-1]
                if '\\n' in cat_title:
                    end = cat_title.split("\\n")
                    cat_title = end[-1]
                if '\\n' in subcat_title:
                    end = subcat_title.split("\\n")
                    subcat_title = end[-1]
                subcat_title = subcat_title.replace(" ", "_")
                if subcat_title == cat_title:
                    continue
                # subcategory subtype is 1
                subcat_title = HanziConv.toSimplified(subcat_title)
                cat_title = HanziConv.toSimplified(cat_title)
                graph.add_edge(cat_title, subcat_title, subtype=1)
                g.addEdge(cat_title, subcat_title)

def simplify_text(self):
    """
    Simplifies text input into simplified Chinese characters in preparation
    for segmentation. For reasons why simplification is done first, check
    documentation for method 'segment_text'.

    Returns:
        text_sim (str): A string containing simplified Chinese characters,
            obtained by simplifying text_input using hanziconv.
    """
    text_filtered = self.filter_text()
    return hanzC.toSimplified(text_filtered)

def expression_process(text):
    # print(text)
    strings = acp.expression_extract(text)
    res = []
    for i in strings:
        if i[1] == 'str':  # compare with ==, not 'is', for string equality
            string = HanziConv.toSimplified(i[0]).lower()
            res += clean_stopwords(list(jieba.cut(string)), stopwords_new)
        else:
            res += [i[0]]
    return res

def process(inqueue, outqueue):
    while True:
        line = inqueue.get()
        if line is None:
            break
        words = [HanziConv.toSimplified(w) for w in line]
        words = [w2 for w1 in words
                 for w2 in jieba.cut(w1, cut_all=False) if len(w2) > 1]
        words = [w.encode('utf-8') for w in words]
        text = b' '.join(words)
        outqueue.put(text)
    return

def add_node(dict_re_match_object):
    """ upload node created from regular expression matched object.
    (6,'深圳证券交易所上市公司',13,0,0)

    Keyword Arguments:
    re_match_object -- re object
    """
    # iterate over the batch of matched nodes
    for index, value in dict_re_match_object.items():
        if value is not None:
            item = dict_re_match_object.get(index)
            graph.add_node(HanziConv.toSimplified(item.group(2)[1:-1]))

def get_titles(file_path):
    titles = set()
    with gzip.open(file_path, 'rb') as f:
        for line in f:
            title = line.decode('utf-8').rstrip('\n')
            for c in title:
                if u'\u4e00' <= c <= u'\u9fff':
                    titles.add(HanziConv.toSimplified(title))
                    break
            titles.add(title)
    return titles

def mmtv_genre(soup):
    genres = ""
    if soup.find("span", {"class": "posts-inner-details-text-under"}):
        soup = soup.find("span", {"class": "posts-inner-details-text-under"})
        genres_mmtv = soup.find_all("span")
        genres = []
        for genre in genres_mmtv:
            genre = HanziConv.toSimplified(str(genre.text))
            genre = get_genre_type(genre)
            genres.append(genre)
    return genres

def __init__(self, url):
    self.url = url
    try:
        res = requests.get(url, timeout=3)
        self.page = BeautifulSoup(res.text, "html.parser")
        self.timeout = False
    except requests.exceptions.RequestException as e:
        errorLogger.exception(f"Error handling web page:{self.url}" + " -- request timeout")
        self.page = BeautifulSoup(open("emptyPage.html", encoding="utf8"), "html.parser")
        self.timeout = True
    self.isSong = url.startswith("https://mojim.com/cny")
    if self.isSong:
        try:
            text = self.page.find(id="fsZx1")
            texts = text.prettify().split("\n")
            # store processed and filtered lyric
            filtered = []
            for s in texts:
                s = s.strip()
                if s == "<ol>":
                    break
                if not s.startswith("<") and s.find(":") == -1 and s.find(
                        "※") == -1 and not s.startswith("["):
                    filtered.append(s)
            self.artistName = HanziConv.toSimplified(filtered.pop(0))
            self.songName = HanziConv.toSimplified(filtered.pop(0))
            s = ""
            for t in filtered:
                s = s + t + os.linesep
            # a string representation of the lyric; each line is separated by \n
            self.lyric = HanziConv.toSimplified(s)
        except:
            self.isSong = False
            errorLogger.exception(
                f"Error handling web page:{self.url}\n\t{self.page.text}")
    self.relatedPages = self.relatedSongs()

def get_info(app: str = "Spotify") -> Tuple[str, str, float, str, float]:
    """
    Get information about the track in play.

    Parameters
    ----------
    app : str, optional
        Name of the app, by default "Spotify"

    Returns
    -------
    Tuple[str, str, float, str, float]
        Title, artists, position, status, duration of the track

    Examples
    --------
    >>> title, artists, position, status, duration = get_info("Spotify")
    >>> status in [None, "playing", "paused"]
    True
    """
    template = f"""
    on run
        if application "{app}" is running then
            tell application "{app}"
                set currentInfo to {{name of current track, "|", artist of current track, "|", player position, "|", player state, "|", duration of current track}}
            end tell
        end if
        return currentInfo
    end run
    """
    code, res, error = osascript.run(template, background=False)
    title = artists = position = status = duration = None
    if code == 0:
        segments = res.split("|")
        title, artists, position, status, duration = map(
            lambda x: x.strip(' ,"'), segments)
        if all(x is not None
               for x in [title, artists, position, status, duration]):
            position = float(position)
            duration = float(duration)
            if duration > 1200:
                # values above 1200 are assumed to be reported in milliseconds
                duration /= 1000
            title = HanziConv.toSimplified(title)
            title = re.sub(r"[[《<((【「{].*?[]】)」}>)》]", "", title)
            title = title.rsplit("-", 1)[0]
    else:
        logger.debug(error)
    return title, artists, position, status, duration

def normalize(self, ss):
    # convert Traditional Chinese to Simplified
    ss1 = HanziConv.toSimplified(ss)
    # lowercase
    ss2 = ss1.lower()
    # keep only Chinese characters, letters and digits; replace everything else with a space
    ss3 = self.char_filter.sub(r' ', ss2)
    # collapse consecutive spaces
    ss4 = ' '.join(ss3.strip().split())
    if not isinstance(ss4, str):
        ss4 = ss4.encode('utf8')
    return ss4

def sent_extract(sent):
    sent = HC.toSimplified(sent)
    extracted = []
    for pt in [emoji_pt, link_pt, share_pt1, share_pt2]:
        ex = ';'.join(pt.findall(sent))
        if ex:
            sent = pt.sub('', sent)
        extracted.append(ex)
    sent = remove_pt.sub('', sent).strip()
    return sent, extracted

def seg(x):
    x = HanziConv.toSimplified(x)
    x = re.sub('\x05|\x06|\x07|\.\.|\.\.\.', ' ', x)
    # w = posseg.cut(x.upper())
    # w = [word for word, flag in w if word not in stopwords and flag in keep_property]
    w = jieba.cut(x.upper())
    w = [
        word.strip() for word in w
        if word not in stopwords and len(word.strip()) > 0
    ]
    # w = [word.strip() for word in w if len(word) > 0]
    return ' '.join(w)

def chinese_tokenizer(documents):
    """
    Convert Chinese text into a sequence of words.
    """
    for document in documents:
        # convert Traditional Chinese to Simplified
        text = HanziConv.toSimplified(document)
        # lowercase English
        text = text.lower()
        # segment into words
        yield list(cut(text))

def get_char_list(query):
    query = HanziConv.toSimplified(query.strip())
    regEx = re.compile('[\\W]+')  # split the sentence on any run of non-word, non-digit characters
    res = re.compile(r'([\u4e00-\u9fa5])')  # [\u4e00-\u9fa5] is the Chinese character range
    sentences = regEx.split(query.lower())
    str_list = []
    for sentence in sentences:
        if res.split(sentence) is None:
            str_list.append(sentence)
        else:
            ret = res.split(sentence)
            str_list.extend(ret)
    return [w for w in str_list if len(w.strip()) > 0]

def remove_sepcail_segment(content, jieba_stop_words):
    content = HanziConv.toSimplified(content)
    seg_list = jieba.cut(content)
    seg_clean = []
    for word in seg_list:
        clean_word = getChinese(word).strip()
        if clean_word == '':
            continue
        seg_clean.append(clean_word)
    seg_clean = [word for word in seg_clean if word not in jieba_stop_words]
    return ','.join(seg_clean)

def segment2(self, sent):
    ssent = HanziConv.toSimplified(sent)
    res = self.segmenter.segment(ssent)
    arr = []
    start = 0
    for i in range(res.size()):
        length = len(res.get(i))
        arr.append(sent[start:start + length])
        start += length
    return arr

def convert_to_simplified(text):
    if u'歷' in text:
        text = text.replace(u'歷', u'历')
    return HanziConv.toSimplified(text)

def traditional_to_simplified(ustring):
    return HanziConv.toSimplified(ustring)

def simplify_or_none(text):
    if text is None:
        return None
    else:
        return HanziConv.toSimplified(text)

# coding:utf-8
from hanziconv import HanziConv

stop_file = open("./other_data/stop_word.txt", 'r')
stop_word_array = []
for line in stop_file:
    temp = line.replace("\n", "")
    temp = HanziConv.toSimplified(temp)
    if temp not in stop_word_array:
        stop_word_array.append(temp)

stop_file1 = open("./generated_data/stop_word_final.txt", "w")
for i in stop_word_array:
    stop_file1.write(i.encode('utf8') + "\n")

def chinese_tokenizer(s, lower=True):
    s = unicode(s)
    if lower:
        s = hanzi.toSimplified(s)
    return [t[0] for t in jieba_tokenize(s)]

def simplified_eq(a, b):
    return len(a) == len(b) and \
        HanziConv.toSimplified(a[0]) == \
        HanziConv.toSimplified(b[0])

# 草 ("grass"), "肏" is the actual character. "艹" is not a real character # but it's used this way "操你", "草你", "日你", # f**k you "操他", "草他", "日他", # f**k his "操她", "草她", "日她", # f**k her # Discrimination (racial slurs) "小日本", # little Japanese "台湾狗", # Taiwanese dogs "共产中国", # communist Chinese "流氓国家", # rogue country "人渣", # human slag "我去", # this is verbal and bad "鬼子" # devil, usually a suffix ] BAD = [HanziConv.toSimplified(word) for word in bad_init] + \ [HanziConv.toTraditional(word) for word in bad_init] INFORMAL = [ # Hello "你好", # nǐ hǎo; The standard "hello" greeting. "您好", # nín hǎo; The same "hello" greeting as above "你怎么样", # nǐ zěnmeyàng?; "What's up?", "How are you doing?" # Good afternoon "午安", # wǔ'an; note: seldom used in the Mainland. "下午好", # xìawǔ hǎo! Seldom used in the Republic of China # Good evening / Good night "晚安", # wǎn'an; Literally "Peace at night", Good night. "晚上好", # wǎnshang hǎo; Good evening!
def sort_func(s):
    m = re.search(ur"^(\[.+?\])(.+?):", s)
    if m:
        s = m.group(2) + m.group(1)
    else:
        m = re.search(ur"^\[.+?\](.*)", s)
        if m:
            s = m.group(1)
    return s


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("input", action="store", nargs=1)
    parser.add_argument("output", action="store", nargs=1)
    parser.add_argument("--encoding", action="store", default="utf_8_sig", nargs=1)
    parser.add_argument("--traditional", action="store_true", default=False)
    args = parser.parse_args()

    buf = codecs.open(args.input[0], "rb", args.encoding).read()
    if args.traditional:
        buf = HanziConv.toTraditional(buf)
    else:
        buf = HanziConv.toSimplified(buf)
    lines = buf.split("\n")
    lines.sort(key=sort_func)
    codecs.open(args.output[0], "wb", args.encoding).writelines(lines)

dic_postive = {}
dic_negative = {}
dic_term_orientation = {}
pos = 0.0
neg = 0.0
oth = 0.0

reader = csv.reader(open("./generated_data/training_file.csv", "rb"))
for row in reader:
    if row[1] == "1":
        pos += 1
    elif row[1] == "0":
        neg += 1
    elif row[1] == "2":
        oth += 1
    flag = row[1]
    temp = HanziConv.toSimplified(row[3])
    words = jieba.cut(temp, cut_all=False)
    word_is_counted = []
    for w in words:
        if w not in word_is_counted:
            word = w.encode('utf8')
            if (word not in punctuation and word not in stop_word_list) and only_nonascii(word) != "":
                if flag == '1':
                    if word not in dic_postive:
                        dic_postive[word] = 2
                    else:
                        dic_postive[word] += 1
                    if word not in dic_negative:
                        dic_negative[word] = 1
                elif flag == '0':
                    if word not in dic_negative:

# [dic_TW, dic_HK, dic_CN] = mdic()
# str_TW = conv(a, dic_TW)
# str_HK = conv(c, dic_HK)
# str_CN = conv(b, dic_CN)
# print a, ' <-> ', str_TW, '\n', c, ' < -> ', str_HK, '\n', b, ' < -> ', str_CN


def check_contain_chinese(check_str):
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return True
    return False


if __name__ == '__main__':
    fin = codecs.open("zhwiki-20151226-all-titles-in-ns0", "r", "utf-8")
    fout = codecs.open("zhwiki-titles-converted", "w", "utf-8")
    # [dic_TW, dic_HK, dic_CN] = mdic()
    # print(HanziConv.toSimplified("!_"))
    cnt = 0
    while True:
        cnt += 1
        if cnt % 10000 == 0:
            print(cnt)
        line = fin.readline()
        if line == "":
            break
        if check_contain_chinese(line):
            fout.write(HanziConv.toSimplified(line))