# -*- coding: utf-8 -*-
# Standard-library imports used in this module. Project-local helpers
# (libfile, libdata, es_api, zhidao_fetch, ZhidaoFetch, search_zhidao_best,
# getLocalFile, gcounter, slack_msg, clean_answer, clean_question,
# KIDS_2W_QUERY_FILENAME, ...) are assumed to be defined/imported elsewhere.
import codecs
import collections
import datetime
import glob
import json
import os
import random
import re
import time


def read_kidsfaq2w(limit=10):
    # one-time conversion from the raw ES dump, kept for reference:
    # filename = getLocalFile(KIDS_2W_FILENAME)
    # list_json = libfile.file2list(filename)
    # list_query = []
    # for item in list_json:
    #     item = json.loads(item)
    #     q = item["_source"]["question"]
    #     if "@" not in q:
    #         list_query.append(q)
    # libfile.lines2file(list_query, getLocalFile(KIDS_2W_QUERY_FILENAME))
    list_query = libfile.file2list(getLocalFile(KIDS_2W_QUERY_FILENAME))
    print "Length of kidsfaq2w ", len(list_query)
    random.shuffle(list_query)
    # slicing never overruns, so [0:limit] already caps at len(list_query)
    return list_query[0:limit]
def run_gen_url_search_realtime(self, filename):
    lines = libfile.file2list(filename)
    visited = set()
    for line in sorted(lines):
        for query_parser in [0]:
            query_url, qword = zhidao_fetch.get_search_url_qword(
                line, query_parser=query_parser)
            if query_url in visited:
                continue
            visited.add(query_url)
            print qword, query_url
    print len(visited)

    filename_output = getLocalFile(
        os.path.basename(filename.replace("human.txt", "_urls.txt")))
    libfile.lines2file(sorted(list(visited)), filename_output)
def init_from_json(self):
    map_items = {}
    dirname = getLocalFile("raw/chat0708/*")
    for filename in glob.glob(dirname):
        src = os.path.basename(filename).replace(".txt", "")
        for line in libfile.file2list(filename):
            gcounter["total_" + src] += 1
            item = json.loads(line)
            item["source"] = src
            item["answers"] = clean_answer(item["answers"])
            item["question"] = clean_question(item["question"])
            if len(item["answers"]) < 2:
                gcounter["items_skip_empty_answer"] += 1
                continue

            # flag answers containing sensitive words (敏感词 = "sensitive words")
            label = ""
            skip_words = self.api_nlp.detect_skip_words(item["answers"])
            if skip_words:
                label = u"敏感词:{}".format(u",".join(skip_words))
                print label, item["answers"]
                gcounter["minganci_answer"] += 1
            # flag purely numeric answers ("+" added; the original single-char
            # pattern could never match answers longer than one character)
            if not label and re.search("^[0-9\-]+$", item["answers"]):
                label = "number"
            item["label"] = label

            q = item["question"]
            if q not in map_items:
                map_items[q] = item
                gcounter["from_" + src] += 1
            else:
                gcounter["overwrite_" + src] += 1
                # log the answer being replaced, then overwrite with the new item
                print "overwrite", q, src, map_items[q]["answers"], item["answers"]
                map_items[q] = item
    gcounter["init_from_json"] = len(map_items)

    filename = getLocalFile("temp/qa0708chat.xls")
    items = sorted(map_items.values(), key=lambda x: x["question"])
    libfile.writeExcel(items, ["label", "question", "answers", "source"], filename)
def run_get_best_search_realtime(self, filename):
    results = []
    counter = collections.Counter()
    lines = libfile.file2list(filename)
    for query_parser in [0]:
        for line in sorted(lines):
            cnt_label = "query_{}".format(query_parser)
            if counter[cnt_label] % 10 == 0:
                print datetime.datetime.now().isoformat(), counter[cnt_label], line
            counter[cnt_label] += 1

            ret_one = search_zhidao_best(line, query_filter=0, query_parser=query_parser)
            if ret_one:
                item = ret_one["best_qapair"]
                print "=====>", line
                print "------", item["match_score"], item["question"]
                print item["answers"], "*******", item["answers_raw"][len(item["answers"]):]

                for p in ["query"]:
                    item[p] = ret_one[p]
                #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                results.append(item)
                for p in ["source", "result_index"]:
                    counter["{}_{}".format(p, item[p])] += 1
                for p in ["question", "answers"]:
                    if p in item:
                        if not isinstance(item[p], unicode):
                            item[p] = item[p].decode("gb18030")

    filename_output = getLocalFile(
        os.path.basename(filename.replace("human.txt", "xls")))
    libfile.writeExcel(results, [
        "id", "source", "result_index", "cnt_like", "cnt_answer",
        "query", "question_id", "question", "answers"
    ], filename_output)
    #libfile.writeExcel(results, ["query", "source", "cnt_like", "cnt_answer", "question", "answers"], filename_output)
    print counter
def index_xianer12w_test(self):
    dataset_index = "xianer12w_test"
    filename = getLocalFile("input/chat8xianer12w.txt")
    visited = set()
    for line in libfile.file2list(filename):
        if line in visited:
            continue
        visited.add(line)
        gcounter["lines"] += 1
        item = {
            "question": line,
            "answers": u"无语",  # placeholder answer ("speechless")
            "id": es_api.gen_es_id(line)
        }
        self.upload(dataset_index, item)
    # trailing call with no item presumably flushes any buffered documents
    self.upload(dataset_index)
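# The indexing above relies on es_api.gen_es_id() to derive a stable document id
# from the question text, so re-running the loader overwrites rather than
# duplicates documents. A minimal sketch of such an id helper is shown below;
# how es_api.gen_es_id actually hashes is an assumption, this is illustration only.
def gen_stable_id_sketch(text):
    # hypothetical helper: md5 over the UTF-8 bytes of the text
    import hashlib
    if isinstance(text, unicode):
        text = text.encode("utf-8")
    return hashlib.md5(text).hexdigest()

# e.g. gen_stable_id_sketch(u"你好") always yields the same 32-char hex id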
def clean_cmu():
    dirname = getLocalFile("raw/cmu/*.txt")
    #print dirname
    lines = set()
    seq = []
    counter = collections.Counter()
    for filename in glob.glob(dirname):
        counter["files"] += 1
        for line in libfile.file2list(filename):
            zhstr = libdata.extract_zh(line)
            counter["lines"] += 1
            if zhstr and len(zhstr) > 1:
                counter["occurs"] += 1
                #print zhstr
                seq.append(zhstr)
                lines.add(zhstr)

    print len(lines)
    filename_output = getLocalFile("output/cmu6w.txt")
    libfile.lines2file(sorted(list(lines)), filename_output)

    print len(seq)
    filename_output = getLocalFile("output/cmu6w_seq.txt")
    libfile.lines2file(seq, filename_output)
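# clean_cmu() depends on libdata.extract_zh() to keep only the Chinese part of
# each raw line. The sketch below shows one plausible way such a helper could
# work (keep characters in the basic CJK ideograph range, drop everything else);
# the real libdata implementation may differ, so treat this as an assumption.
def extract_zh_sketch(line):
    # hypothetical helper: keep only CJK Unified Ideographs (U+4E00..U+9FFF)
    if not isinstance(line, unicode):
        line = line.decode("utf-8", "ignore")
    return u"".join(ch for ch in line if u"\u4e00" <= ch <= u"\u9fff")

# e.g. extract_zh_sketch(u"hello 你好 world") -> u"你好"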
def fetch_detail(worker_id=None, worker_num=None, limit=None, config_index="prod",
                 filename_input=None, fetch_option="top_n", fetch_limit=100):
    print filename_input
    # validate the input before deriving output paths from it
    if not filename_input:
        print "FATAL "
        return

    flag_batch = (worker_id is not None and worker_num is not None and worker_num > 1)
    flag_prod = (config_index == "prod")
    flag_slack = (flag_prod and worker_id == 0)

    job_name = os.path.basename(filename_input).replace(".txt", "")
    output_dir = "output0623"
    if flag_batch:
        filename_output_xls = getLocalFile("{}/{}.{}_worker.xls".format(output_dir, job_name, worker_id))
        filename_output_xls2 = getLocalFile("{}/{}.{}_worker_query.xls".format(output_dir, job_name, worker_id))
        filename_output_json = getLocalFile("{}/{}.{}_worker.json.txt".format(output_dir, job_name, worker_id))
    else:
        filename_output_xls = getLocalFile("{}/{}.batch_{}.all.xls".format(output_dir, job_name, config_index))
        filename_output_xls2 = getLocalFile("{}/{}.batch_{}.all_query.xls".format(output_dir, job_name, config_index))
        filename_output_json = getLocalFile("{}/{}.batch_{}.all.json.txt".format(output_dir, job_name, config_index))

    CONFIG = {
        "local": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 3,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",  # Zhidao search, chitchat
            "debug": True,
            # "cache_server": "http://52.196.166.54:8000"  # 内网IP (internal network IP)
            "cache_server": "http://192.168.1.179:8000"
        },
        "prod": {
            "batch_id": "zhidao-search0623-20160621",
            "crawl_http_method": "get",
            "crawl_gap": 5,
            "crawl_use_js_engine": False,
            "crawl_timeout": 10,
            "crawl_http_headers": {},
            "note": "知道搜索,闲聊",  # Zhidao search, chitchat
            "debug": False,
        }
    }

    list_query = libfile.file2list(filename_input)
    print "Length of kidsfaq2w ", len(list_query)

    config = CONFIG[config_index]
    #config = {}
    api = ZhidaoFetch(config)

    ts_start = time.time()
    ts_lap_start = time.time()
    counter = collections.Counter()

    if limit and limit < len(list_query):
        # down-sample to roughly `limit` evenly spaced queries
        step = len(list_query) / limit
        list_query = [list_query[i * step] for i in range(limit)]
        print len(list_query)

    if flag_slack:
        slack_msg(u"AWS {}/{}. run {} batch_id: {}, urls: {} debug: {}".format(
            worker_id, worker_num, config["note"], config["batch_id"],
            len(list_query), config.get("debug", False)))

    results = []
    with codecs.open(filename_output_json, 'w') as fjson:
        for query in list_query:
            if counter["visited"] % 1000 == 0:
                print datetime.datetime.now().isoformat(), counter
            counter["visited"] += 1

            if flag_batch:
                # shard queries across workers: each worker keeps every
                # worker_num-th query and skips the rest
                if (counter["visited"] % worker_num) != worker_id:
                    counter["skipped_peer"] += 1
                    continue

            counter["processed"] += 1
            if counter["processed"] % 1000 == 0:
                if flag_slack:
                    slack_msg("AWS {}/{}. working {}. lap {} seconds.\n{}".format(
                        worker_id, worker_num, config["batch_id"],
                        int(time.time() - ts_lap_start), json.dumps(counter)))
                ts_lap_start = time.time()

            if "search_all" == fetch_option:
                ret = api.search_all(query, limit=fetch_limit)
            else:
                # fn_fetch is expected to be provided elsewhere for the default option
                ret = fn_fetch(query)

            if ret:
                ret["query"] = query
                fjson.write(u"{}\n".format(json.dumps(ret, ensure_ascii=False)))

            if ret and ret.get("items"):
                counter["has_result"] += 1
                counter["total_qa"] += len(ret["items"])
                if config.get("debug"):
                    print len(ret["items"]), json.dumps(ret, ensure_ascii=False)
                for item in ret["items"]:
                    #print json.dumps(item, ensure_ascii=False, indent=4, sort_keys=True)
                    results.append(item)
                    item["query"] = query
                    for p in ["source"]:
                        counter["{}_{}".format(p, item[p])] += 1
                    for p in ["question", "answers"]:
                        if p in item:
                            if not isinstance(item[p], unicode):
                                item[p] = item[p].decode("gb18030")
            else:
                counter["missing_data"] += 1

    #libfile.writeExcel(results, ["id", "source", "result_index", "cnt_like", "cnt_answer", "query", "question_id", "question", "answers"], filename_output_xls)
    #libfile.writeExcel(results, ["id", "is_good", "match_score", "result_index", "cnt_like", "cnt_answer", "query", "question", "answers"], filename_output_xls, page_size=5000)
    #print filename_output_xls
    #libfile.writeExcel(results, ["label", "query", "answers", "match_score", "question"], filename_output_xls)
    libfile.writeExcel(results, ["label", "question", "answers"], filename_output_xls)
    libfile.writeExcel(results, ["label", "query", "answers", "match_score", "question"], filename_output_xls2)

    duration_sec = int(time.time() - ts_start)
    print "all done, seconds", duration_sec, duration_sec / max(1, counter["visited"]), counter
    if flag_slack:
        slack_msg("AWS {}/{}. done {}. total {} seconds".format(
            worker_id, worker_num, config["batch_id"], duration_sec))
def test(self, dataset_index="chat8xianer12w", option="query"): filename_todo = getLocalFile("input/{}_todo.txt".format(option)) print "filename_todo", filename_todo q_todo = set() if os.path.exists(filename_todo): q_todo = libfile.file2set(filename_todo) gcounter["q_todo"] = len(q_todo) print "filename_todo", filename_todo, len(q_todo) filename_skip = getLocalFile("input/{}_skip.txt".format(option)) print "filename_skip", filename_skip q_skip = set() if os.path.exists(filename_skip): q_skip = libfile.file2set(filename_skip) gcounter["q_skip"] = len(q_skip) print "filename_skip", filename_skip, len(q_skip) data = {} q_all = set() dirname = getLocalFile( "output0623/{}*worker*json.txt".format(dataset_index)) for filename in glob.glob(dirname): print filename gcounter["files"] += 1 for line in libfile.file2list(filename): entry = json.loads(line) query = entry["query"] #print entry.keys() if "items_all" not in entry: gcounter["selected_no_data"] += 1 continue elif len(entry["items_all"]) == 0: gcounter["selected_no_item"] += 1 continue if q_skip and query in q_skip: gcounter["items_skip"] += 1 q_all.add(query) continue if self.api_nlp.detect_skip_words(query): gcounter["selected_query_skipwords"] += 1 q_all.add(query) continue items_select = self.api_nlp.select_qapair_0624( query, entry["items_all"]) if items_select: gcounter["selected_yes"] += 1 q_all.add(query) else: gcounter["selected_no"] += 1 for item in items_select: item["id"] = es_api.gen_es_id(item["question"] + item["answers"]) if item["id"] in data: continue label = self.filter_qa_by_label("", item["question"], item["answers"]) if label: item["label"] = label else: item["label"] = u"" xlabel = re.sub(":.*$", "", item["label"]) gcounter["data_with_label_{}".format(xlabel)] += 1 gcounter["items"] += 1 data[item["id"]] = item #ret = libfile.readExcel(["category","question","answers"], filename, start_row=1) if q_todo: q_todo.difference_update(q_all) filename_output = getLocalFile( "edit0623/query_miss_{}.xls".format(option)) libfile.lines2file(sorted(list(q_todo)), filename_output) gcounter["q_all"] = len(q_all) gcounter["q_miss"] = len(q_todo)