def _to_text(value):
    """Return *value* as text, decoding UTF-8 byte strings and passing text through."""
    # On Python 2 ``bytes`` is ``str``, so this matches the old
    # ``unicode(value, "utf-8")`` call for byte strings; on Python 3 (and for
    # already-decoded text on Python 2) the value is returned unchanged.
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def refine(ch_rhyme, ch, alignment=False, topn=50):
    """Replace ``ch`` with a similar in-vocabulary character that rhymes with ``ch_rhyme``.

    The Word2Vec model stored at ``_w2v_model_path`` is queried for the
    ``topn`` entries most similar to ``ch``.  The first of those entries that
    is a key of the ``ch2int`` vocabulary mapping and whose rhyme (as reported
    by ``RhymeUtil.get_rhyme``) equals the rhyme of ``ch_rhyme`` is returned.

    Args:
        ch_rhyme: Character whose rhyme must be matched (text or UTF-8 bytes).
        ch: Character to replace (text or UTF-8 bytes).
        alignment: Accepted for backward compatibility; it currently has no
            effect because the same model is loaded for either value.
        topn: Number of nearest neighbours requested from the model.

    Returns:
        The first suitable neighbour, or ``ch`` exactly as passed in when no
        neighbour qualifies.
    """
    model = models.Word2Vec.load(_w2v_model_path)
    rdict = RhymeUtil()
    _int2ch, ch2int = get_vocab()
    rhyme = rdict.get_rhyme(_to_text(ch_rhyme))
    neighbours = model.wv.most_similar(positive=[_to_text(ch)], topn=topn)
    for entry in neighbours:
        candidate = entry[0]  # each entry starts with the neighbouring token
        if candidate in ch2int and rdict.get_rhyme(candidate) == rhyme:
            return candidate
    return ch
def _strip_parenthesized(body):
    """Remove ``(...)`` spans from *body*; return ``None`` if parentheses are unbalanced."""
    left = body.find('(')
    while left >= 0:
        right = body.find(')')
        if right < left:
            # Either there is no closing parenthesis at all, or one appears
            # before the opening parenthesis currently being processed.
            return None
        body = body[:left] + body[right + 1:]
        left = body.find('(')
    if ')' in body:
        # A closing parenthesis is left over with nothing to open it.
        return None
    return body


def _parse_corpus(raw_file, json_file):
    """Parse a tab-separated poem corpus and dump the accepted poems as JSON.

    The first line of ``raw_file`` is read as the tab-separated column tags.
    Each following line is one poem; reading stops at the first line that is
    empty after stripping.  Every column other than ``body`` is copied into
    the poem dict under its tag.  The ``body`` column has parenthesized spans
    removed and is split with ``split_sentences``; the result is stored under
    ``'sentences'``.

    A poem is dropped when its row has no ``body`` column, when its
    parentheses are unbalanced, or when any character of any sentence is
    rejected by ``RhymeUtil.has_char``.

    Args:
        raw_file: Path of the UTF-8 encoded corpus file to read.
        json_file: Path of the JSON file to write.

    Returns:
        The list of accepted poem dicts (the same data written to
        ``json_file``).
    """
    # No explicit flush here; add sys.stdout.flush() if the progress text
    # must appear before parsing finishes.
    print("Parsing %s ..." % raw_file, end=' ')
    rdict = RhymeUtil()
    data = []
    with codecs.open(raw_file, 'r', 'utf-8') as fin:
        tags = fin.readline().strip().split('\t')
        line = fin.readline().strip()
        while line:
            poem = {'source': os.path.basename(raw_file)}
            # Reset per row so a row lacking a body column can never pick up
            # the body of the previous row.
            body = None
            for idx, tok in enumerate(line.split('\t')):
                if tags[idx] != 'body':
                    poem[tags[idx]] = tok
                else:
                    body = tok
            cleaned = None if body is None else _strip_parenthesized(body)
            if cleaned is not None:
                sentences = split_sentences(cleaned)
                if all(rdict.has_char(ch)
                       for sentence in sentences
                       for ch in sentence):
                    poem['sentences'] = sentences
                    data.append(poem)
            line = fin.readline().strip()
    with codecs.open(json_file, 'w', 'utf-8') as fout:
        json.dump(data, fout)
    print("Done (%d poems)" % len(data))
    return data
def _parse_couplet(raw_file, json_file):
    """Parse a couplet file into two-sentence poems and dump them as JSON.

    ``raw_file`` is read as repeating groups of three lines: the two lines of
    a couplet followed by one line that is read and discarded.  Lines are
    stripped of surrounding whitespace, and reading stops as soon as either
    line of a couplet is empty.

    Args:
        raw_file: Path of the UTF-8 encoded couplet file to read.
        json_file: Path of the JSON file to write.

    Returns:
        A list of dicts, each with ``'source'`` (the base name of
        ``raw_file``) and ``'sentences'`` (the two couplet lines); the same
        data is written to ``json_file``.
    """
    print("Parsing %s ..." % raw_file, end=' ')
    sys.stdout.flush()
    source = os.path.basename(raw_file)
    data = []
    with codecs.open(raw_file, 'r', 'utf-8') as fin:
        line1 = fin.readline().strip()
        line2 = fin.readline().strip()
        while line1 and line2:
            data.append({'source': source, 'sentences': [line1, line2]})
            fin.readline()  # skip the line that follows each couplet
            line1 = fin.readline().strip()
            line2 = fin.readline().strip()
    with codecs.open(json_file, 'w', 'utf-8') as fout:
        json.dump(data, fout)
    print("Done (%d poems)" % len(data))
    return data