def buildGraph(dbName):
    # build an undirected adjacency dict from the postlinks collection;
    # weight 0 marks a duplicate link (LinkTypeId=3), weight 1 a related link (LinkTypeId=1)
    # (relies on module-level tqdm, logger, args, MongoStackExchange)
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)
    links = db.stackdb.get_collection("postlinks")
    allLinks = list(links.find().batch_size(args.batch_size))
    myG = {}
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))
        # record the edge in both directions
        if id_a in myG:
            myG[id_a][id_b] = w
        else:
            myG[id_a] = {id_b: w}
        if id_b in myG:
            myG[id_b][id_a] = w
        else:
            myG[id_b] = {id_a: w}
    logger.info("finished finding {} sublinks".format(len(allLinks)))
    return myG
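
# Usage sketch for buildGraph above (not from the original repo): the returned
# adjacency dict maps a post id to its linked posts, weight 0 = duplicate link,
# weight 1 = related link. The post id below is a made-up placeholder.
def demoGraphLookup():
    myG = buildGraph("stackoverflow")
    related = myG.get(12345, {})  # neighbors of a hypothetical post id
    duplicates = [pid for pid, w in related.items() if w == 0]
    print("post 12345: {} linked posts, {} duplicates".format(len(related), len(duplicates)))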
def test3():
    # estimate how many questions/answers are "code heavy": posts whose extracted
    # code snippets make up more than `threshold` of the raw text
    from post_rec.DataSet.DBLoader import MongoStackExchange
    from post_rec.Utility.TextPreprocessing import PreprocessPostContent

    processor = PreprocessPostContent()
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    dbName = 'stackoverflow'
    db.useDB(dbName)

    count = 0
    threshold = 0.2
    verbose = 0
    for q in db.questions.find().batch_size(10000):
        txt = q['Title'] + q['Body']
        codes = ' '.join(processor.getCodeSnippets(txt))
        if len(codes) and verbose < 10:
            print(len(codes), len(txt))
            verbose += 1
        # guard against empty posts before dividing
        if len(txt) and len(codes) / len(txt) > threshold:
            count += 1
    print("code question is {}/{}".format(count, db.questions.count()))

    count = 0
    for ans in db.answers.find().batch_size(10000):
        txt = ans['Body']
        codes = ' '.join(processor.getCodeSnippets(txt))
        if len(codes) and verbose < 10:
            print(len(codes), len(txt))
            verbose += 1
        if len(txt) and len(codes) / len(txt) > threshold:
            count += 1
    print('code answer is {}/{}'.format(count, db.answers.count()))
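
# Minimal sketch factoring out the code-ratio heuristic test3 applies; the
# helper name is an assumption, getCodeSnippets is the same method used above.
def isCodeHeavy(txt, processor, threshold=0.2):
    codes = ' '.join(processor.getCodeSnippets(txt))
    return len(txt) > 0 and len(codes) / len(txt) > threshold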
def init(tokenizer_class):
    # per-worker initializer: build a tokenizer and a DB connection once per
    # process and register cleanup handlers for interpreter shutdown
    # (Finalize comes from multiprocessing.util in the original module)
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = MongoStackExchange(host='10.1.1.9', port=36666)  # port as int, matching the other call sites
    PROCESS_DB.useDB(dbName)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
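
# Sketch of how an initializer like init is typically wired into a worker pool;
# the pool size and the submitted work are placeholders, not values from the repo.
def demoPool(tokenizer_class):
    from multiprocessing import Pool
    pool = Pool(processes=4, initializer=init, initargs=(tokenizer_class,))
    # ... submit work with pool.map / pool.apply_async, then shut down:
    pool.close()
    pool.join()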
def distinctMoveStack(src, dst):
    # copy documents from src to dst, dropping duplicates by "Id";
    # documents are buffered and flushed in batches of 10000
    db = MongoStackExchange("10.1.1.9", 36666)  # port as int, matching the other call sites
    db.useDB("stackoverflow")
    id_set = set()
    dst_collection = db.stackdb[dst]
    src_collection = db.stackdb[src]
    insert_cache = []
    print("count before distinct", src_collection.count())
    for doc in src_collection.find().batch_size(10000):
        if doc["Id"] in id_set:
            continue
        id_set.add(doc["Id"])
        insert_cache.append(doc)
        if len(insert_cache) >= 10000:  # flush a full batch
            dst_collection.insert_many(insert_cache)
            insert_cache.clear()
            print("process {}/{}".format(dst_collection.count(), src_collection.count()))
    if len(insert_cache) > 0:  # flush the remainder
        dst_collection.insert_many(insert_cache)
        print("process {}/{}".format(dst_collection.count(), src_collection.count()))
    print("count after distinct", dst_collection.count())
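
# Hedged alternative sketch: the same Id-dedup can be done server-side with a
# Mongo aggregation ($group by Id, keep the first document, write to dst via
# $out). The function name and collection names are placeholders; this assumes
# db.stackdb is a pymongo Database, as the code above suggests.
def distinctMoveAggregate(db, src="questions_raw", dst="questions_dedup"):
    db.stackdb[src].aggregate([
        {"$group": {"_id": "$Id", "doc": {"$first": "$$ROOT"}}},
        {"$replaceRoot": {"newRoot": "$doc"}},
        {"$out": dst},
    ], allowDiskUse=True)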
def buildGraph(dbName):
    # variant of buildGraph that also builds a networkx graph and, for large
    # graphs, splits it into connected components
    # (relies on module-level networkx as nx, tqdm, logger, args)
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)
    links = db.stackdb.get_collection("postlinks")
    allLinks = list(links.find().batch_size(args.batch_size))
    G = nx.Graph()
    myG = {}  # plain-dict adjacency mirror of G
    for link in tqdm.tqdm(allLinks, desc="building graph from links"):
        id_a, id_b = link["PostId"], link["RelatedPostId"]
        r = link["LinkTypeId"]
        if r == 3:
            w = 0
        elif r == 1:
            w = 1
        else:
            raise ValueError("unexpected value {} for link type".format(r))
        G.add_edge(id_a, id_b, weight=w)
        if id_a in myG:
            myG[id_a][id_b] = w
        else:
            myG[id_a] = {id_b: w}
        if id_b in myG:
            myG[id_b][id_a] = w
        else:
            myG[id_b] = {id_a: w}
    logger.info("finished finding {} sublinks".format(len(allLinks)))
    logger.info("graph size of edges({}) and nodes({})".format(
        len(list(G.edges)), len(list(G.nodes))))
    if len(G.nodes) < 1e+4:
        return [G], G
    else:
        logger.info("cutting graph into small blocks")
        graphs = []
        for cc in nx.connected_components(G):
            g = G.subgraph(cc)
            graphs.append(g)
        graphs.sort(key=lambda g: len(g.nodes), reverse=True)
        logger.info("num of subGs:{}".format(len(graphs)))
        subnodes = list(map(lambda g: len(g.nodes), graphs))[:10]
        logger.info("nodes of subG(top10):{}".format(subnodes))
        return graphs, G
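
# Sketch consuming the pair returned by the buildGraph variant above: components
# come back sorted by node count (descending) when the graph is split, and
# graphs[0] is the whole graph otherwise. Not code from the original repo.
def demoLargestComponent():
    graphs, G = buildGraph("stackoverflow")
    largest = graphs[0]
    print("largest block: {} nodes, {} edges out of {} total nodes".format(
        len(largest.nodes), len(largest.edges), len(G.nodes)))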
def genResults():
    # pair each query question with its reference answer (accepted, else
    # highest-scored) and the generated summary, then dump to JSON
    import json

    Qids = readQueryId("../../dataCases/query_list.txt")
    Summy = {}
    for i in range(100):
        Summy[Qids[i]] = readSummary("../../dataCases/Summary_list/%d.txt" % i)
    print(len(Qids), len(Summy))

    Answers = {}
    processor = PreprocessPostContent()
    docDB = MongoStackExchange(host="10.1.1.9", port=50000)
    docDB.useDB("stackoverflow")
    for qid in Qids:
        question = docDB.questions.find_one({"Id": qid})
        if not question:
            print("None Error", qid, question)
            continue
        # prefer the accepted answer; otherwise fall back to the highest-scored one
        if "AcceptedAnswerId" in question and question["AcceptedAnswerId"]:
            ans = docDB.answers.find_one({"Id": question["AcceptedAnswerId"]})["Body"]
        else:
            answers = list(docDB.answers.find({"ParentId": qid}))
            if len(answers) < 1:
                print("Error!", qid)
                continue
            answers.sort(key=lambda x: x["Score"], reverse=True)
            ans = answers[0]["Body"]
        ans = " ".join(processor.getPlainTxt(ans))
        Answers[qid] = {"true": ans, "generated": Summy[qid]}

    print(len(Answers))
    with open("../../dataCases/answers.json", "w") as f:
        json.dump(Answers, f)
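
# Factored-out sketch of the answer-selection rule genResults applies: prefer
# the accepted answer, else the highest-scored one. The helper name is an
# assumption; it returns None when the question has no usable answer.
def pickBestAnswerBody(docDB, question):
    if question.get("AcceptedAnswerId"):
        accepted = docDB.answers.find_one({"Id": question["AcceptedAnswerId"]})
        if accepted:
            return accepted["Body"]
    answers = list(docDB.answers.find({"ParentId": question["Id"]}))
    if not answers:
        return None
    answers.sort(key=lambda x: x["Score"], reverse=True)
    return answers[0]["Body"]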
    # (tail of the enclosing function) keep only labeled pairs whose both ids
    # are known questions, then build the corpus from the filtered set
    labelDataNew = []
    for ld in labelData:
        id1, id2 = ld["pair"]
        if id1 not in q_ids_set or id2 not in q_ids_set:
            continue
        labelDataNew.append(ld)
    labels = map(lambda ll: ll["label"], labelData)
    import collections
    logger.info(collections.Counter(labels))
    generateQuestionCorpus(labelDataNew, postData_local)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--source', type=str, default="crossvalidated")
    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB("posts")
    logger.info("task source is {}".format(args.source))

    main()
def initDB(dbName):
    db = MongoStackExchange(host='10.1.1.9', port=50000)
    db.useDB(dbName)
    return db
    # (tail of the enclosing writer) flush any lines still buffered in the cache
    if len(cache) > 0:
        f.writelines(cache)
        cache.clear()


def main():
    questionsDataGlobal = fetchQuestionData()
    answersDataGlobal = fetchAnswerData(questionsDataGlobal.keys())
    indexerDataGlobal = fetchIndexData(questionsDataGlobal.keys())
    generateContextAnswerCorpusParallel(questionsDataGlobal, answersDataGlobal, indexerDataGlobal)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--db', type=str, default="crossvalidated")
    parser.add_argument('--workers', type=int, default=10)
    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    dbName = args.db
    docDB.useDB(dbName)
    logger.info("processing db data: {}".format(dbName))

    main()
    # (tail of the enclosing generator) write the collected samples
    with open(seq2seq_sample_file_dst, "w") as f:
        f.writelines(dataDst)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=1000)
    parser.add_argument('--db', type=str, default="corpus")
    parser.add_argument('--maxSize', type=int, default=-1)
    parser.add_argument('--task', type=str, default="seq2seq")
    parser.add_argument('--contextLen', type=int, default=312)
    parser.add_argument('--questionLen', type=int, default=100)
    parser.add_argument('--answerLen', type=int, default=100)
    args = parser.parse_args()

    docDB = MongoStackExchange(host='10.1.1.9', port=50000)
    docDB.useDB(args.db)

    # dispatch on the requested task (only one branch can match)
    if args.task == "inference":
        logger.info("task is " + args.task)
        inferenceGen()
    elif args.task == "seq2seq":
        logger.info("task is " + args.task)
        seq2seqGen()
    elif args.task == "knowNet":
        logger.info("task is " + args.task)
        knowNetGen()
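
# Equivalent dispatch-table sketch for the task selection above (a stylistic
# alternative, not code from the original repo):
def runTask(task):
    handlers = {"inference": inferenceGen, "seq2seq": seq2seqGen, "knowNet": knowNetGen}
    if task not in handlers:
        raise ValueError("unknown task {}".format(task))
    logger.info("task is " + task)
    handlers[task]()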