# Module-level imports used by the functions below. `util`, `DATA_ROOT`,
# `cleanup`, `check_negative`, `data_vote`, `change_encoding`, `wenda_proc`
# and `log` are project-local and are not defined in this file.
import codecs
import random
import re

import util


def combine_labeled_data():
    fname_neg = DATA_ROOT + "filtered/bdv2/baidu_rel.tsvwb"
    fname_pos = DATA_ROOT + "filtered/bdv2/baidu_sim.tsvwb"
    fname_all = DATA_ROOT + "filtered/bdv2/baidu_qq.tsv"
    result = []
    seen = set()
    # Negative pairs: label 0.
    cont1 = util.readlines_from_file(fname_neg)
    idx, tot = 0, 0
    while idx < (len(cont1) - 1):
        vals = cont1[idx].split("\t")
        idx += 1
        if len(vals) != 2:
            continue
        w1 = cleanup(vals[0])
        w2 = cleanup(vals[1])
        if not check_negative(w1, w2):
            continue
        seen.add(w1)
        seen.add(w2)
        result.append(w1 + "\t" + w2 + "\t0\n")
        tot += 1
    # Positive pairs: label 1, capped so there are not too many positives.
    cont2 = util.readlines_from_file(fname_pos)
    idx = 0
    while idx < min(len(cont2), tot):
        vals = cont2[idx].split("\t")
        idx += 1
        if len(vals) != 2:
            continue
        w1 = cleanup(vals[0])
        w2 = cleanup(vals[1])
        idx += 1  # note: idx advances a second time here, so every other positive line is skipped
        #if not w1 in seen and not w2 in seen:
        #    continue
        result.append(w1 + "\t" + w2 + "\t1\n")
    random.shuffle(result)
    with codecs.open(fname_all, "w", "gbk") as fw:
        for res in result:
            fw.write(res)
    # Write a full copy of the shuffled pairs to the `.new` file.
    with codecs.open(fname_all + ".new", "w", "gbk") as f1:
        for res in result:
            f1.write(res)
    # with codecs.open(fname_all + ".2", "w", "gbk") as f1:
def read_data(fname):
    lines = util.readlines_from_file(fname)
    data = []
    for line in lines:
        vals = line.split(" ")
        data.append(vals[0].strip())
    return data
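# `util.readlines_from_file` is a project-local helper whose implementation is
# not shown here. A minimal sketch of the interface the callers in this file
# appear to rely on (return the file's lines as a list of strings) might look
# like the following; the real helper may differ, e.g. in encoding or stripping.
def _readlines_from_file_sketch(fname, encoding="utf-8"):
    """Hypothetical stand-in for util.readlines_from_file."""
    with open(fname, "r", encoding=encoding) as f:
        return f.readlines()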
def load(self):
    """Read the `crontab` file."""
    self._judges = []
    content = util.readlines_from_file(Crontab.FILENAME)
    for line in content:
        self._load_one_line(line)
    if len(self._judges) == 0:
        log.info('The `crontab` file has no settings.')
def intercept():
    # Keep only the rows where the prediction from data_vote() agrees with
    # the label in the third column of the .new file.
    preds = data_vote()
    data = util.readlines_from_file(DATA_ROOT + "filtered/bdv2/baidu_qq.tsv.new")
    with open(DATA_ROOT + "filtered/bdv2/join.tsv", "w") as fw:
        for i in range(len(preds)):
            line = preds[i].strip().replace("__label__", "")
            if len(line) == 0:
                continue
            label = data[i].strip().split("\t")[2]
            if label == line:
                fw.write(data[i] + "\n")
def gen_label():
    fname1 = "C:\\Workspace\\Data\\wenda\\filtered\\bdv2\\odmodel_newdata\\baidu_qq.tsv.new"
    fname2 = "C:\\Workspace\\Data\\wenda\\filtered\\bdv2\\odmodel_newdata\\pred.tsv2"
    data1 = util.readlines_from_file(fname1)
    data2 = util.readlines_from_file(fname2)
    c1, c2 = [], []
    # c1: labels from the third column of the .new file.
    for d1 in data1:
        vals = d1.split("\t")
        if len(vals) != 3:
            continue
        c1.append(vals[2])
    # c2: scores from the second column of the prediction file.
    for d2 in data2:
        vals = d2.split("\t")
        if len(vals) != 2:
            continue
        c2.append(vals[1])
    if len(c1) != len(c2):
        print("label/prediction counts do not match")
    with open(fname1 + ".label", "w") as fw:
        for i in range(len(c1)):
            fw.write(c1[i] + "\t" + c2[i] + "\n")
def iconv(f1, f2, enfrom="utf-8", ento="gbk"):
    # Transcode f1 into f2, collapsing runs of commas along the way.
    with open(f2, "w") as fw:
        lines = util.readlines_from_file(f1)
        for line in lines:
            #line = line.strip().replace(" ", ",").replace("\n", ",") + "\n"
            line = re.sub(",+", ",", line)
            #if not "\t" in line or len(line) == 1 or line.count("\t") > 1:
            #    continue
            try:
                s = change_encoding(line, enfrom, ento)
            except Exception as e:
                print(e, line)
                continue
            fw.write(s + "\n")
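# `change_encoding` is referenced above but not defined in this file. A minimal
# sketch of a transcoding helper with that shape (re-encode text from `enfrom`
# to `ento`, dropping characters the target codec cannot represent) could look
# like this; the real project helper may behave differently.
def _change_encoding_sketch(text, enfrom="utf-8", ento="gbk"):
    """Hypothetical stand-in for change_encoding; returns a str."""
    if isinstance(text, bytes):
        text = text.decode(enfrom, errors="ignore")
    return text.encode(ento, errors="ignore").decode(ento)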
def get_qq_es(fname):
    data = util.readlines_from_file(fname)
    with open(fname + ".qq", "w") as fw:
        q1, q2 = "", ""
        for line in data:
            line = line.strip()
            if line.startswith("input:"):
                q1 = line[len("input:"):]
            else:
                if len(line) == 0:
                    continue
                idx = line.rfind(" ")
                q2 = line[0:idx]
                fw.write(q1 + "\t" + q2 + "\t-1\n")
def proc_score():
    fname = "C:\\Workspace\\Data\\wenda\\filtered\\bdv2\\odmodel_newdata\\pred.tsv"
    data = util.readlines_from_file(fname)
    with open(fname + "2", "w") as fw:
        for line in data:
            vals = line.split("\t")
            if len(vals) <= 1:
                continue
            lab = int(vals[0])
            if lab == 1:
                fw.write(line + "\n")
            else:
                # For label 0, write the complementary score.
                s1 = float(vals[1])
                s2 = 1 - s1
                fw.write("0\t" + str(s2) + "\n")
def gen_questions():
    content = util.readlines_from_file(DATA_ROOT + "/qclick/wenda_q2q_v2.dat")
    dedup = set()
    with open(DATA_ROOT + "/qclick/wenda_questions.txt", "w") as fw:
        for line in content:
            vals = line.split("\t")
            if len(vals) != 2:
                continue
            if len(vals[1]) <= wenda_proc.MIN_COUNT_Q or len(vals[1]) >= wenda_proc.MAX_COUNT_Q:
                continue
            if vals[1] in dedup:
                #print("dup " + vals[1])
                continue
            dedup.add(vals[1])
            fw.write(vals[1] + "\n")
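# A hedged sketch of how the preparation steps above might be chained; the
# actual entry point and ordering are not shown in this file, and the
# prediction file consumed by proc_score() and gen_label() is assumed to come
# from a separate model run in between.
if __name__ == "__main__":
    combine_labeled_data()   # build the shuffled labeled pair files
    gen_questions()          # extract deduplicated questions
    # ... score externally to produce pred.tsv, then post-process:
    # proc_score()
    # gen_label()
    # intercept()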