def find_pinyin(word):
    """Return the concatenated tone numbers for *word*, or None if it contains non-Chinese characters."""
    tones = None
    characters = list(word)
    i = 0
    has_english = False
    # Stop scanning as soon as any non-Chinese character turns up.
    while not has_english and i < len(characters):
        has_english = not hanzidentifier.has_chinese(characters[i])
        i += 1
    if not has_english:
        ret = p.get_pinyin(word, tone_marks="numbers")
        words = ret.split("-")
        # The tone number is the last character of each numbered pinyin syllable.
        tones = [w[-1] for w in words]
        tones = tone_change(tones)
        tones = "".join(tones)
    return tones
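# A minimal sketch of the setup find_pinyin() relies on. The snippet itself
# does not show these definitions: `p` looks like an xpinyin.Pinyin instance
# (its get_pinyin(word, tone_marks="numbers") call matches that API), and
# tone_change() is a project-specific helper, stubbed out here as an identity
# function (an assumption).
import hanzidentifier
from xpinyin import Pinyin

p = Pinyin()

def tone_change(tones):
    # Hypothetical stub; the real helper presumably applies tone-sandhi rules.
    return tones

print(find_pinyin("你好"))   # "33" with the identity stub
print(find_pinyin("hello"))  # None: contains non-Chinese characters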
def is_entirely_chinese(s):
    """
    Check if every character in the string is Chinese.

    *s* string to search in
    """
    debug('TEST(IEC): ' + str(s))
    for c in s:
        if not ziid.has_chinese(c):
            debug('TEST(IEC): ' + c + ' is not chinese.')
            return False
    return True
def is_chinese(self, text):
    ret = []
    if hanzidentifier.has_chinese(text):
        # Count how many CJK characters the text contains.
        chinese_array = re.findall(r'[\u4e00-\u9fff]+', text)
        chinese_chars = sum(len(run) for run in chinese_array)
        if len(text) / 3 < chinese_chars:
            # At least 1/3 of the sentence is Chinese characters.
            identity = hanzidentifier.identify(text)
            if identity is hanzidentifier.SIMPLIFIED:
                ret = [[1, 'ZH-CHS']]
            elif identity is hanzidentifier.TRADITIONAL:
                ret = [[1, 'ZH-CHT']]
            elif identity in (hanzidentifier.BOTH, hanzidentifier.MIXED):
                ret = [[1, 'ZH-CHT'], [1, 'ZH-CHS']]
    return ret
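# For reference, the hanzidentifier constants that is_chinese() branches on;
# identify() returns one of UNKNOWN, SIMPLIFIED, TRADITIONAL, BOTH, or MIXED:
import hanzidentifier

assert hanzidentifier.identify('汉字') is hanzidentifier.SIMPLIFIED
assert hanzidentifier.identify('漢字') is hanzidentifier.TRADITIONAL
assert hanzidentifier.identify('你好') is hanzidentifier.BOTH
assert hanzidentifier.identify('汉字漢字') is hanzidentifier.MIXED
assert hanzidentifier.identify('hello') is hanzidentifier.UNKNOWN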
def get_indices(keyword, dataframe):
    """
    Retrieves document indexes from the dataframe based on the keyword.
    Returns a list of integers that correspond to the tweet or Weibo post
    index in the corresponding corpus.

    Arguments:
    keyword -- (str) a string that is either English or Chinese.
    dataframe -- (pd.DataFrame) a corpus of social media posts from Twitter or Weibo.
    """
    indices = []
    if hanzidentifier.has_chinese(keyword):
        indices.extend(
            dataframe[dataframe['text'].str.contains(keyword)].index)
    else:
        word_regex = r"[a-z]+"
        match = re.match(word_regex, keyword)
        if match:
            indices.extend(dataframe[dataframe['joined_lems'].str.contains(
                pat=fr'\b{keyword}\b', regex=True, case=False)].index)
    return indices
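# A toy usage sketch for get_indices(). The 'text' and 'joined_lems' column
# names come from the snippet above; the sample rows are hypothetical.
import pandas as pd

sample_df = pd.DataFrame({
    'text': ['疫情还没结束', 'Masks are required indoors.'],
    'joined_lems': ['', 'mask be require indoors'],
})
print(get_indices('疫情', sample_df))   # [0] via substring match on 'text'
print(get_indices('mask', sample_df))  # [1] via word-boundary match on 'joined_lems'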
def create_wc(word, indices_list, filename, dataframe):
    """
    Generate and return a word cloud based on the keyword.

    Arguments:
    word -- (str) a string that is either English or Chinese.
    indices_list -- (list) list of integers corresponding to the index of a dataframe.
    filename -- (str) path the word cloud image is saved to.
    dataframe -- (pd.DataFrame) a corpus of social media posts from Twitter or Weibo.
    """
    all_posts = []
    response_weibo = requests.get(weibo_mask_url)
    response_twitter = requests.get(twitter_mask_url)
    background_cn = np.array(Image.open(BytesIO(response_weibo.content)))
    background_en = np.array(Image.open(BytesIO(response_twitter.content)))
    if hanzidentifier.has_chinese(word):
        # Need to download the font file before loading.
        font_path = 'SourceHanSansSC-Regular.otf'
        background = background_cn
        stopwords = chinese_stopwords
    else:
        font_path = None
        background = background_en
        stopwords = english_stopwords
    for post_index in indices_list:
        all_posts.append(
            dataframe.loc[dataframe.index == post_index, "text"].iloc[0])
    wordcloud = WordCloud(background_color='white',
                          max_words=100,
                          width=200,
                          height=100,
                          font_path=font_path,  # was hard-coded to the Chinese font
                          mask=background,
                          stopwords=stopwords).generate(" ".join(all_posts))
    plt.figure(figsize=(10, 5))  # figsize is in inches; (1000, 500) would be enormous
    plt.imshow(wordcloud)
    plt.axis("off")
    wordcloud.to_file(filename)
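# Hypothetical end-to-end call for the two helpers above (weibo_df and the
# mask URLs/stopword sets are assumed to be defined elsewhere, as in the
# snippets):
# doc_indices = get_indices('口罩', weibo_df)
# create_wc('口罩', doc_indices, 'word_cloud.png', weibo_df)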
final = [[[], []] for _ in range(len(dat))]
i = 0
while i < len(dat):
    print(i)
    mapping = {}  # renamed from `dict`, which shadowed the builtin
    a = eval(dat[i][0])
    b = eval(dat[i][1])
    for k in b:
        if b[k] in a:
            mapping[k] = a[b[k]]
    # print(mapping)
    for k in mapping:
        # Expand each word-level B- label into character-level BIO labels.
        if hanzidentifier.has_chinese(k) and len(k) > 1:
            for j in k:
                if j != "" and j != " ":
                    final[i][0].append(j)
                    if mapping[k] == "B-entity" and j == k[0]:
                        final[i][1].append(mapping[k])
                    elif mapping[k] == "B-entity" and j != k[0]:
                        final[i][1].append("I-entity")
                    elif mapping[k] == "B-action" and j == k[0]:
                        final[i][1].append(mapping[k])
                    elif mapping[k] == "B-action" and j != k[0]:
                        final[i][1].append("I-action")
    i += 1  # assumed increment; the excerpt ends before it, but the loop needs it to terminate
def do_GET(self):
    self.send_response(200)
    query = parse.urlsplit(self.path).query
    query_dict = parse.parse_qs(query)
    if "keyword_search_frontend.css" in self.path:
        self.send_header('Content-type', 'text/css; charset=utf-8')
        self.end_headers()
        with open("keyword_search_frontend.css", encoding="utf-8") as f:
            html = f.read()
        self.wfile.write(html.encode("utf-8"))
    elif "keyword_search_frontend.js" in self.path:
        self.send_header('Content-type', 'text/javascript; charset=utf-8')
        self.end_headers()
        with open("keyword_search_frontend.js", encoding="utf-8") as f:
            html = f.read()
        self.wfile.write(html.encode("utf-8"))
    elif self.path == "/":
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()
        with open("keyword_search_frontend.html", encoding="utf-8") as f:
            html = f.read()
        self.wfile.write(html.encode("utf-8"))
    elif "png" in self.path:
        self.send_header('Content-type', 'image/png;')
        self.end_headers()
        with open("word_cloud.png", "rb") as wc:
            self.wfile.write(wc.read())
    else:
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()
        print(query_dict)
        if query_dict['type'][0] == 'category':
            language = query_dict["lang"][0]
            selected_category = query_dict["categories"][0]
            print(language, selected_category)
            results = dropdown_menu(selected_category, language)
            posts_table = put_in_table(results)
            self.wfile.write(b"<html>" + posts_table.encode("utf-8") + b"</html>")
        else:
            keyword = query_dict["keyword"][0]
            if hanzidentifier.has_chinese(keyword):  # checks for language
                dataframe = weibo_df
            else:
                keyword = lemmatize(keyword.lower())
                dataframe = twitter_df
            if keyword in english_vocab or keyword in chinese_vocab:  # checks if keyword is in the corpus
                doc_indices = get_indices(keyword, dataframe)
                results = get_posts(keyword, doc_indices, dataframe)
                create_wc(keyword, doc_indices, "word_cloud.png", dataframe)
                posts_table = put_in_table(results)
                random_num = str(random.randint(1, 1000))
                self.wfile.write(b"<html>" + posts_table.encode("utf-8") +
                                 b'<img src="word_cloud.png?' +
                                 random_num.encode("utf-8") + b'">' +
                                 b"</html>")
            else:
                message = (f'Sorry, "{keyword}" is not in our corpus. '
                           'Make sure the keyword is only one word and there are no spaces.')
                self.wfile.write(b"<html>" + message.encode("utf-8") + b"</html>")
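# A minimal, hypothetical bootstrap for the handler above; the snippet only
# shows do_GET, so the class name and port here are assumptions:
from http.server import HTTPServer, BaseHTTPRequestHandler

class KeywordSearchHandler(BaseHTTPRequestHandler):
    pass  # do_GET as defined above lives here

if __name__ == "__main__":
    HTTPServer(('localhost', 8000), KeywordSearchHandler).serve_forever()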
start = perf_counter()
Data = pd.read_csv(filename, sep=',', header=None, usecols=[0])
Data.columns = ["Chinese"]
end = perf_counter()
print(f"Took: {round(end - start, 2)}")  # builtin round() takes ndigits, not a `sigfigs` keyword

Freqs = {}
print("finding freq...")
start = perf_counter()
for i in range(len(Data)):
    characters = list(Data["Chinese"][i])
    for character in characters:
        # Count each Chinese character; skip everything else.
        if hanzidentifier.has_chinese(character):
            if character not in Freqs:
                Freqs[character] = 1
            else:
                Freqs[character] += 1
end = perf_counter()
print(f"Took: {round(end - start, 2)}")

Freq_data = pd.DataFrame(columns=["Chinese", "Freq"])
print("rearranging...")
start = perf_counter()
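# The "rearranging..." step is cut off above; a plausible completion
# (an assumption, not the original code) sorts the counts into the
# Freq_data frame by descending frequency:
Freq_data = pd.DataFrame(sorted(Freqs.items(), key=lambda kv: kv[1],
                                reverse=True),
                         columns=["Chinese", "Freq"])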
import io
import json  # io and json are used below but were missing from the imports
import os

import hanzidentifier  # to see if the file contains Chinese
import pinyin  # to convert Chinese to its pinyin romanization

# io.open("myfile.txt", 'r', encoding="windows-1252")
print("JUST DANCE KTAPE ENCRYPTOR BY YUNYL")
filename = input("Enter JDU ktape: ")
with io.open(filename, 'r', encoding="utf-8") as json_file:
    data = json.load(json_file)

# Check all lyrics for Chinese and convert them to their English pronunciation.
for track in data["Clips"]:
    # If the track contains Chinese characters...
    if hanzidentifier.has_chinese(track['Lyrics']):
        # ...convert to English (pinyin).
        track['Lyrics'] = pinyin.get(
            track['Lyrics'], format="strip", delimiter=" ") + " "

i = 0
songname = json.dumps(data['MapName'], sort_keys=False, indent=4).lower()
songnamenormal = songname[1:-1]  # strip the surrounding JSON quotes
songnameready = "WII_" + songnamenormal
songnamereadyt = songnameready + "_tml_karaoke.ktape.ckd"
filewhereput = open(songnamereadyt, "w")
howmanyclipsinktape = len(data['Clips'])
howmanyclipsinktapeadd10000 = howmanyclipsinktape * 200 + 3514
howmanyclipsinktapehexreadywith10000 = hex(
    int(howmanyclipsinktapeadd10000)).replace("0x", "")
def test_has_chinese(self):
    self.assertFalse(hanzidentifier.has_chinese(UNKNOWN))
    self.assertTrue(hanzidentifier.has_chinese(BOTH))
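# Hypothetical fixtures matching the assertions above (the real values live
# in the test module): UNKNOWN contains no Chinese at all, while BOTH is
# text identifiable as both simplified and traditional.
UNKNOWN = 'Hello, world!'
BOTH = '你好'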
def parse_file(filename_traditional, filename_simplified_jyutping, entries):
    traditional = []
    with open(filename_traditional, "r", encoding="utf8") as f:
        reader = csv.reader(f, delimiter=" ")
        traditional = list(reader)

    # The Kaifangcidian data for jyutping is horrible.
    # The entire data is on a single line, printed like a flat Python list.
    # The entry may be a single item in the array, or multiple items.
    # The Jyutping pronunciation is a separate item for each character in the entry.
    # The translations to Mandarin may, or may not, follow the Jyutping!
    # And there is no separator between data for different entries :)
    last_line = ""
    with open(filename_simplified_jyutping, "r", encoding="utf8") as f:
        last_line = f.readlines()[-1]
    simplified = ast.literal_eval(last_line)

    index = 0
    for row in range(len(traditional)):
        if row < 9:
            # The first nine rows are comments and headers.
            continue

        trad = traditional[row][0]
        # Horrible data workaround 1:
        # In KFCD Jyutping data, when the entry has Chinese characters in it,
        # the entry is presented as a single string in the array. (This is sane.)
        # If it does not (e.g. the word 'pat pat'), each series of characters,
        # delineated by a space, is a separate entry in the array
        # ('pat pat' => ["pat", "pat"]).
        trad_len = len(trad.split(" "))
        if not hanzidentifier.has_chinese(trad):
            simp = "".join(simplified[index:index + trad_len])
        else:
            simp = simplified[index]

        # Horrible data workaround 2:
        # In KFCD Jyutping data, the Jyutping for each word in an entry
        # is presented as a separate string.
        # To find the indices that correspond to the entry we just extracted,
        # use the data from the KFCD Yale edition (which is formatted as a CSV)
        # to determine how many items comprise the Jyutping pronunciation.
        # One cannot use the string length of the entry, as it may contain
        # punctuation (e.g. '，') that has no corresponding Jyutping syllable,
        # AND the entry may be split up into multiple items (as described in
        # horrible workaround #1).
        jyut_len = len(traditional[row][1].split(" "))
        jyut = " ".join(simplified[index + trad_len:index + trad_len + jyut_len])

        pin = (" ".join(
            lazy_pinyin(trad, style=Style.TONE3,
                        neutral_tone_with_five=True)).lower().replace("v", "u:"))

        # Horrible data workaround 3:
        # In the KFCD Yale data, all the definitions are listed as a single
        # item, separated by the wide-character '，'. Some entries have
        # definitions, and some do not.
        # In the KFCD Jyutping edition, the definitions are also listed all as
        # a single item. However, many words do not have definitions; if there
        # are no definitions then we do NOT need to advance the index by 1 more
        # item (which would have been the definitions).
        if traditional[row][2]:
            defs_traditional = traditional[row][2].split("，")
            defs_simplified = simplified[index + trad_len + jyut_len].split("，")
            definitions = []
            for (def_traditional, def_simplified) in zip(defs_traditional,
                                                         defs_simplified):
                if def_traditional != def_simplified:
                    definitions.append(def_traditional + " – " + def_simplified)
                else:
                    definitions.append(def_traditional)
            index += trad_len + jyut_len + 1
        else:
            definitions = ["(沒有對應漢語詞彙)"]  # "(no corresponding Mandarin term)"
            index += trad_len + jyut_len

        entry = objects.Entry(trad=trad, simp=simp, pin=pin, jyut=jyut,
                              defs=definitions)
        if trad in entries:
            entries[trad].append(entry)
        else:
            entries[trad] = [entry]
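# A worked toy example of the index arithmetic in workaround #1, using data
# shaped like the KFCD Jyutping list (the values here are hypothetical).
# For the non-Chinese entry 'pat pat', the simplified form spans two items
# and the Jyutping syllables follow immediately after:
simplified = ['pat', 'pat', 'pat1', 'pat1']
trad = 'pat pat'
trad_len = len(trad.split(' '))                      # 2
simp = ''.join(simplified[0:trad_len])               # 'patpat', mirroring the snippet
jyut = ' '.join(simplified[trad_len:trad_len + 2])   # 'pat1 pat1'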
for filename in os.listdir(hkcancor_path):
    file = open(os.path.join(hkcancor_path, filename))
    print("processing: " + filename)
    sentence = []
    x = True
    while True:
        line = file.readline()
        if "<sent_tag>" in line:
            x = True
            while x:
                for char in line:
                    # Keep Chinese characters and basic punctuation.
                    if has_chinese(char):
                        sentence.append(char)
                    elif char == "。" or char == ",":
                        sentence.append(char)
                if "</sent_tag>" in line:
                    # Randomly assign label 0 to roughly 30% of sentences.
                    labels.append(0 if random() <= 0.3 else 1)
                    print(''.join(sentence))
                    sentences.append(''.join(sentence))
                    sentence = []
                    x = False
                break
        if not line:
            break  # EOF reached (assumed completion; the excerpt ends here)