def add_coauthor_relation(begin, end, msg):
    """Compute coauthor counts and coauthor relationships for a slice of authors.

    For each author in mag_authors0411[begin:end], looks up each of the
    author's papers in mag_papers0415 and records every other listed author
    as a coauthor. Writes back in one bulk update:
        {coauthor_counts: n,
         coauthor: [{coauthor_id: ..., coauthor_time: year}, ...]}

    :param begin: start index of the author cursor slice
    :param end: end index of the author cursor slice
    :param msg: thread/process label used in progress output
    """
    start_time = time()
    print(start_time)
    col1 = connectTable("qiuzh", "mag_papers0415")
    col2 = connectTable("qiuzh", "mag_authors0411")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)[begin:end]
    try:
        for i in cursor:
            author_id = i["_id"]
            coauthor_times = 0
            coauthor_list = []
            for paper in i["new_pubs"]:
                paper_details = col1.find_one({"_id": paper})
                # Guard against dangling paper ids: find_one returns None when
                # the paper is missing (this check was commented out before and
                # the code crashed on None).
                if not paper_details:
                    continue
                coauthor_times += len(paper_details["authors"]) - 1
                for author in paper_details["authors"]:
                    if author["id"] != author_id:
                        coauthor_list.append({
                            "coauthor_id": author["id"],
                            "coauthor_time": paper_details["year"],
                        })
            if coauthor_list:
                operation.append(pymongo.UpdateOne(
                    {"_id": author_id},
                    {"$set": {"coauthor_counts": coauthor_times,
                              "coauthor": coauthor_list}}))
        print(msg, "线程已完成", len(operation), flush=True)
        # bulk_write raises InvalidOperation when given an empty request list.
        if operation:
            col2.bulk_write(operation, ordered=False)
    finally:
        # A no-timeout cursor lingers server-side unless closed explicitly.
        cursor.close()
    print(msg, time(), (time() - start_time))
def author_citation_number(begin, end, msg):
    """Sum per-paper citation counts ("cn") into each author document.

    Intended for mag_authors0510 together with mag_papers0510 (and citation
    network0515). For every author in the [begin:end] slice, totals the "cn"
    field of each of the author's papers and stores it as the author's "cn".
    Writes in batches of 10000 updates.

    :param begin: start index of the author cursor slice
    :param end: end index of the author cursor slice
    :param msg: thread/process label used in progress output
    """
    colpaper = connectTable("qiuzh", "mag_papers0510")
    col_author = connectTable("qiuzh", "mag_authors0510")
    count = 0
    operation = []
    cursor = col_author.find(no_cursor_timeout=True)[begin:end]
    for author in cursor:
        count += 1
        author_id = author["_id"]
        citation_number = 0
        for paper in author["new_pubs"]:
            # find_one fetches a single document; no_cursor_timeout is
            # meaningless here and has been dropped.
            p = colpaper.find_one({"_id": paper["pid"]})
            # Skip papers missing from mag_papers0510; the original crashed
            # on p["cn"] when find_one returned None.
            if p:
                citation_number += p["cn"]
        operation.append(
            pymongo.UpdateOne({"_id": author_id},
                              {"$set": {"cn": citation_number}}))
        if count % 10000 == 0:
            print(msg, "已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print(msg, "已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
    cursor.close()
def clone_collection():
    """Copy every paper that has a "venue" field from oga_one.mag_paper_plus2
    into qiuzh.papers, then print the size of the target collection."""
    coll = connectTable("oga_one", "mag_paper_plus2")
    col3 = connectTable("qiuzh", "papers")
    # The original wrapped a single clause in a redundant "$and".
    for i in coll.find({"venue": {"$exists": True}}):
        col3.insert_one(i)
    # Cursor.count() was removed in PyMongo 4; count_documents replaces it.
    print(col3.count_documents({}))
def divide_researchers_into_2groups():
    """Split mag_researchers0810 into a training set (first_year <= 1996) and
    a test set (first_year > 1996), writing each side in batches of 10000."""
    col_author = connectTable("qiuzh", "mag_researchers0810")
    col1 = connectTable("qiuzh", "researchers0810_trainingset")
    col2 = connectTable('qiuzh', "researchers0810_testset")
    opt1 = []
    opt2 = []
    count = 0
    cursor = col_author.find(no_cursor_timeout=True)
    # Bug fix: the original opened `cursor` with no_cursor_timeout but then
    # iterated a second, plain col_author.find(), leaking the no-timeout
    # cursor and defeating its purpose.
    for researcher in cursor:
        count += 1
        if researcher["first_year"] <= 1996:
            opt1.append(pymongo.InsertOne(researcher))
        else:
            opt2.append(pymongo.InsertOne(researcher))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            # Guard both flushes: bulk_write raises InvalidOperation on an
            # empty request list (possible when one batch is all one group).
            if opt1:
                col1.bulk_write(opt1, ordered=False)
                print("已写入:", len(opt1), flush=True)
            if opt2:
                col2.bulk_write(opt2, ordered=False)
                print("已写入:", len(opt2), flush=True)
            opt1 = []
            opt2 = []
    if opt1:
        col1.bulk_write(opt1, ordered=False)
        print("又写入:", len(opt1), flush=True)
    if opt2:
        col2.bulk_write(opt2, ordered=False)
        print("又写入:", len(opt2), flush=True)
    cursor.close()
def match_v1_v2_id(begin, end, msg):
    """For each MAG v2 author in mag_authors0409[begin:end], collect the ids
    of all papers in mag_papers that list that author, and store them on the
    author document as "new_pubs".

    :param begin: start index of the author cursor slice
    :param end: end index of the author cursor slice
    :param msg: thread/process label used in progress output
    """
    coll = connectTable("qiuzh", "mag_papers")
    coll3 = connectTable('qiuzh', "mag_authors0409")
    opt = []
    count = 0
    cursor = coll3.find(no_cursor_timeout=True)[begin:end]
    for i in cursor:
        if count % 100000 == 0:
            print("线程: %s, 已完成 %s 万条" % (msg, count / 100000), flush=True)
        count += 1
        v2author_id = i.get("_id")
        # All v1-side papers that list this v2 author id. (The original bound
        # each id to a local named `id`, shadowing the builtin.)
        new_pubs = [paper.get("_id")
                    for paper in coll.find({"authors.id": v2author_id})]
        opt.append(
            pymongo.UpdateOne({"_id": v2author_id},
                              {"$set": {"new_pubs": new_pubs}}))
    cursor.close()
    # Guard: bulk_write raises InvalidOperation on an empty request list.
    if opt:
        coll3.bulk_write(opt, ordered=False)
    print("线程: %s, 遍历了 %s" % (msg, len(opt)))
def researchers_con():
    '''
    Copy the coauthor-times field "con" from mag_authors0510 into
    mag_researchers0707, batching writes every 10000 researchers.
    :return:
    '''
    col1 = connectTable('qiuzh', "mag_authors0510")
    col2 = connectTable('qiuzh', "mag_researchers0707")
    count = 0
    operation = []
    cursor = col2.find(no_cursor_timeout=True)
    for author in cursor:
        count += 1
        author_id = author["_id"]
        source = col1.find_one({"_id": author_id})
        # Guard: the original crashed with TypeError when a researcher was
        # missing from mag_authors0510 (find_one -> None).
        if source is None:
            continue
        operation.append(
            pymongo.UpdateOne({"_id": author_id},
                              {"$set": {"con": source["con"]}}))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            # Guard: bulk_write raises InvalidOperation on an empty list.
            if operation:
                col2.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
    if operation:
        col2.bulk_write(operation, ordered=False)
        print("又处理", len(operation))
    cursor.close()
def paper_citation_number(begin, end, msg):
    '''
    This function is appropriate for citation_network0810_trainingset and
    mag_papers0510.
    For each paper in the [begin:end] slice, counts how many citations it
    receives in the citation network and stores the total as "cn_before1996".
    Writes in batches of 10000.
    :return: adds each paper's total citation count in mag_papers0510
    '''
    colpaper = connectTable("qiuzh", "mag_papers0510")
    col_citation_network = connectTable("qiuzh",
                                        "citation_network0810_trainingset")
    count = 0
    operation = []
    cursor = colpaper.find(no_cursor_timeout=True)[begin:end]
    for paper in cursor:
        count += 1
        paper_id = paper["_id"]
        citation_number = 0
        # Note: a pymongo Cursor object is always truthy, so the original
        # `if paper_citation_relations:` check was a no-op; iterating an
        # empty cursor is already safe, so the check is dropped.
        for relation in col_citation_network.find({"id": paper_id},
                                                  no_cursor_timeout=True):
            citation_number += len(relation["citation"])
        operation.append(
            pymongo.UpdateOne({"_id": paper_id},
                              {"$set": {"cn_before1996": citation_number}}))
        if count % 10000 == 0:
            print(msg, "已处理:", count / 10000, flush=True)
            colpaper.bulk_write(operation, ordered=False)
            print(msg, "已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        colpaper.bulk_write(operation, ordered=False)
    cursor.close()
def clone_author_collection():
    """Copy authors that have both "id" and "pubs" from academic.mag_authors
    into qiuzh.mag_authors0409, using the MAG author id as "_id"."""
    coll = connectTable("academic", "mag_authors")
    col2 = connectTable("qiuzh", "mag_authors0409")
    for i in coll.find({"id": {"$exists": True}}):
        # Membership test directly on the dict; `.keys()` was redundant.
        if "pubs" in i:
            col2.insert_one({"_id": i["id"], "pubs": i["pubs"]})
    # Cursor.count() was removed in PyMongo 4; count_documents replaces it.
    print(col2.count_documents({}))
def filter_papers_by_JCR():
    '''
    Copy papers from col1 into col2, keeping only papers whose journal appears
    in JCR, i.e. papers carrying a "field" attribute.
    :return:
    '''
    col1 = connectTable("qiuzh", "mag_papers0415")
    col2 = connectTable("qiuzh", "mag_papers0510")
    cursor = col1.find({"field": {"$exists": True}}, no_cursor_timeout=True)
    try:
        for i in cursor:
            col2.insert_one(i)
    finally:
        # A no-timeout cursor must be closed explicitly or it lingers
        # server-side; the original never closed it.
        cursor.close()
def filter_authors_by_papers():
    '''
    Save into mag_authors0411 only the authors whose "new_pubs" list is
    non-empty.
    :return:
    '''
    col1 = connectTable("qiuzh", "mag_authors0409")
    col2 = connectTable('qiuzh', "mag_authors0411")
    for i in col1.find():
        # .get avoids a KeyError for documents that never got a "new_pubs"
        # field; empty or missing lists are skipped either way.
        if i.get("new_pubs"):
            col2.insert_one(i)
def clone_paper_collection():
    """Copy papers from oga_one.mag_paper into qiuzh.mag_papers, keeping only
    papers that carry authors, venue, year and references; the v1 "id" becomes
    the new "_id"."""
    coll = connectTable("oga_one", "mag_paper")
    col2 = connectTable("qiuzh", "mag_papers")
    for i in coll.find({"id": {"$exists": True}}):
        # Bug fix: "new_venue" was read unconditionally but never checked,
        # raising KeyError for papers without a venue.
        if ("new_authors" in i and "year" in i and "references" in i
                and "new_venue" in i):
            col2.insert_one({
                "_id": i["id"],
                "authors": i["new_authors"],
                "venue": i["new_venue"],
                "year": i["year"],
                "references": i["references"],
            })
    # Cursor.count() was removed in PyMongo 4; count_documents replaces it.
    print(col2.count_documents({}))
def researchers_collaboration_network():
    '''
    There are some problems in the researchers_con_innewcollection network,
    so we use another method instead: materialize a collaboration network
    first, one document per (author, coauthor, year) edge, restricted to
    coauthors who are themselves researchers in mag_researchers0707.
    Writes in batches of 10000 source authors.
    :return:
    '''
    start_time = time()
    print(start_time, flush=True)
    col1 = connectTable("qiuzh", "mag_papers0510")
    col2 = connectTable("qiuzh", "mag_researchers0707")
    col3 = connectTable("qiuzh", "coauthor_network0722")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)
    count = 0
    for i in cursor:
        count += 1
        author_id = i["_id"]
        for paper in i["new_pubs"]:
            paper_details = col1.find_one({"_id": paper})
            # Guard: skip papers missing from mag_papers0510 instead of
            # crashing on None["authors"].
            if not paper_details:
                continue
            for author in paper_details["authors"]:
                # Keep the edge only when the coauthor is also a researcher.
                if (author["id"] != author_id
                        and col2.find_one({"_id": author["id"]})):
                    operation.append(
                        pymongo.InsertOne({
                            "author_id": author_id,
                            "coauthor_id": author["id"],
                            "coauthor_time": paper_details["year"],
                        }))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            # Guard: bulk_write raises InvalidOperation on an empty list.
            if operation:
                col3.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
            print(time(), flush=True)
    if operation:
        col3.bulk_write(operation, ordered=False)
        print("已完成", len(operation), flush=True)
    # Close the no-timeout cursor (the original leaked it).
    cursor.close()
    print(time(), (time() - start_time), flush=True)
def add_coauthor_relation2newcollection():
    '''
    Coauthor times and coauthor relationships.
    Because some authors in the dataset have too many collaborations and would
    exceed the maximum size of a single document, each relation is stored as
    its own document in a new collection:
        {author_id: ..., coauthor_id: ..., coauthor_time: ...}
    Resumes from offset 3790001 of mag_authors0411; writes in batches of
    10000 source authors.
    '''
    start_time = time()
    print(start_time, flush=True)
    col1 = connectTable("qiuzh", "mag_papers0415")
    col2 = connectTable("qiuzh", "mag_authors0411")
    col3 = connectTable("qiuzh", "coauthor_network0420")
    operation = []
    cursor = col2.find(no_cursor_timeout=True)[3790001:]
    count = 0
    for i in cursor:
        count += 1
        author_id = i["_id"]
        for paper in i["new_pubs"]:
            paper_details = col1.find_one({"_id": paper})
            # Guard against papers missing from mag_papers0415 (the original
            # left this check commented out and crashed on None).
            if not paper_details:
                continue
            for author in paper_details["authors"]:
                if author["id"] != author_id:
                    operation.append(
                        pymongo.InsertOne({
                            "author_id": author_id,
                            "coauthor_id": author["id"],
                            "coauthor_time": paper_details["year"],
                        }))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            # Guard: bulk_write raises InvalidOperation on an empty list.
            if operation:
                col3.bulk_write(operation, ordered=False)
                print("已写入:", count / 10000, flush=True)
                operation = []
            print(time(), flush=True)
    if operation:
        col3.bulk_write(operation, ordered=False)
        print("已完成", len(operation), flush=True)
    # Close the no-timeout cursor (the original leaked it).
    cursor.close()
    print(time(), (time() - start_time), flush=True)
def filter_researchers_paper_by_authors():
    '''
    From mag_researchers0707 (pubs>=10, academic career life >=10) to
    mag_researchers0810: only papers with at most 10 authors are kept in each
    researcher's "new_pubs".
    This function was created on 2021.8.10.
    :return:
    '''
    col1 = connectTable('qiuzh', "mag_researchers0707")
    col2 = connectTable('qiuzh', "mag_researchers0810")
    col_paper = connectTable("qiuzh", "mag_papers0510")
    cursor = col1.find(no_cursor_timeout=True)
    opt = []
    count = 0
    # Cursor.count() was removed in PyMongo 4; count_documents replaces it.
    print(col1.count_documents({}))
    for i in cursor:
        count += 1
        new_pubs = []
        for pub in i["new_pubs"]:
            paper = col_paper.find_one({"_id": pub["pid"]})
            # Guard: skip papers missing from mag_papers0510 instead of
            # crashing on len(None["authors"]).
            if paper and len(paper["authors"]) <= 10:
                new_pubs.append(pub)
        opt.append(
            pymongo.InsertOne({
                "_id": i["_id"],
                "new_pubs": new_pubs,
                "pub_count": i["pub_count"],
                "first_year": i["first_year"],
                "last_year": i["last_year"],
                "cn": i["cn"],
            }))
        if count % 10000 == 0:
            print(len(opt))
            print(count)
            print("已处理:", count / 10000, flush=True)
            col2.bulk_write(opt, ordered=False)
            print("已写入:", count / 10000, flush=True)
            opt = []
    if opt:
        col2.bulk_write(opt, ordered=False)
        print("最终又完成", len(opt))
    print(count)
    cursor.close()
def filter_author_by_careerlife(begin, end, msg):
    '''
    Copy authors whose academic career spans at least 20 years from
    mag_authors0411 into mag_authors0421.
    :param msg: thread label (not used in output here)
    :param begin: start index of the cursor slice
    :param end: end index of the cursor slice
    :return:
    '''
    col1 = connectTable('qiuzh', "mag_authors0421")
    col2 = connectTable('qiuzh', "mag_authors0411")
    cursor = col2.find(no_cursor_timeout=True)[begin:end]
    opt = []
    for i in cursor:
        # Bug fix: career length is last_year - first_year; the original
        # computed first_year - last_year, which is <= 0 and can never
        # reach 20, so nothing was ever selected.
        if i["last_year"] - i["first_year"] >= 20:
            opt.append(pymongo.InsertOne({
                "_id": i["_id"],
                "new_pubs": i["new_pubs"],
                "pub_count": i["pub_count"],
                "first_year": i["first_year"],
                "last_year": i["last_year"],
            }))
    # Guard: bulk_write raises InvalidOperation on an empty request list.
    if opt:
        col1.bulk_write(opt, ordered=False)
    cursor.close()
def filter_papers_by_new_pubs():
    '''
    Delete from qiuzh.papers every paper whose id is not referenced by any
    author's "new_pubs" list in MAG_authors.
    :return:
    '''
    col1 = connectTable("qiuzh", "papers")
    col2 = connectTable('qiuzh', "MAG_authors")
    kept_ids = set()
    for i in col2.find():
        # Guard: authors without a "new_pubs" field made the original iterate
        # over None and raise TypeError.
        for j in i.get("new_pubs") or []:
            kept_ids.add(j)
    print(len(kept_ids))
    result = col1.delete_many({"id": {"$nin": list(kept_ids)}})
    print(result.deleted_count)  # number of deleted documents
def filter_author_by_citation(begin, end, msg):
    '''
    Copy authors with at least 5 publications into MAG_authors.
    :param msg: multi-process information
    :param begin: i-th
    :param end: i+1-th
    :return: pubs counts>=5
    '''
    col1 = connectTable("academic", "mag_authors")
    col2 = connectTable('qiuzh', "MAG_authors")
    # The original loop assigned an unused local (`a = i`); a comprehension
    # expresses the collection step directly.
    opt = [pymongo.InsertOne(i)
           for i in col1.find({"n_pubs": {"$gte": 5}})[begin:end]]
    # Guard: bulk_write raises InvalidOperation on an empty request list.
    if opt:
        col2.bulk_write(opt, ordered=False)
    print("线程: %s, 遍历了 %s" % (msg, len(opt)))
def filter_author_by_abstract():
    '''
    Some of the papers in the dataset are news items and some of the authors
    are journal editors, so they need to be filtered out by abstract.
    Currently a stub: connects to the collection and reports readiness only.
    :return:
    '''
    col1 = connectTable("qiuzh", "mag_papers0415")
    print("okay")
def find_critical_year():
    '''
    2020.8.31: critical_year is 1996, with 559808 researchers in total (more
    than half of the dataset).
    Walks first_year values in ascending order, accumulating researcher
    counts, and prints the year whenever the running total has reached the
    threshold.
    :return:
    '''
    col_author = connectTable("qiuzh", "mag_researchers0810")
    year_list = [
        1802, 1803, 1810, 1814, 1815, 1816, 1819, 1823, 1825, 1827, 1828,
        1829, 1830, 1832, 1833, 1834, 1836, 1838, 1839, 1841, 1842, 1843,
        1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854,
        1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865,
        1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876,
        1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887,
        1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898,
        1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909,
        1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920,
        1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
        1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
        1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953,
        1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964,
        1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
        1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986,
        1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997,
        1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
        2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018
    ]
    # Renamed from `sum`, which shadowed the builtin of the same name.
    total = 0
    for year in year_list:
        researcher_number = col_author.count_documents({"first_year": year})
        total += researcher_number
        print(researcher_number, total)
        if total >= 541461:
            print(year)
def author_pubs_count():
    '''
    Print avg/max/min/sum statistics aggregated over "pub_count" across all
    authors in mag_authors0510.
    :return:
    '''
    coll = connectTable("qiuzh", "mag_authors0510")
    pipeline = [{
        "$group": {
            "_id": {},
            "avg": {"$avg": "$pub_count"},
            "max": {"$max": "$pub_count"},
            "min": {"$min": "$pub_count"},
            "sum": {"$sum": "$pub_count"},
        }
    }]
    for stats_doc in coll.aggregate(pipeline):
        print(stats_doc)
def print_bsur_into_pc():
    '''
    On 8.30 this function dumped the data to txt files; note that by mistake
    the data had been saved into Bsur rather than Bsur0810.
    Exports bsur / ifdis / iftop per researcher into three text files, one
    value per line.
    :return:
    '''
    col1 = connectTable('qiuzh', "mag_researchers0810")
    DI = []
    KI = []
    SI = []
    for author in col1.find():
        DI.append(author["bsur"])
        KI.append(author["ifdis"])
        SI.append(author["iftop"])
    print("list has loaded")
    exports = [
        ("C:/Users/qzh/PycharmProjects/MAG/datafile/Bsur0810.txt", DI),
        ("C:/Users/qzh/PycharmProjects/MAG/datafile/Ifdis0810.txt", KI),
        ("C:/Users/qzh/PycharmProjects/MAG/datafile/Iftop0810.txt", SI),
    ]
    for path, values in exports:
        # `with` guarantees the file handle is closed even if a write fails;
        # the original repeated the open/loop/close pattern three times.
        with open(path, "w+") as data:
            for value in values:
                print(value, file=data)
def calculate_coauthor_times2():
    '''
    This version is intended for mag_authors0510; the function is not
    finished. For each author, counts documents whose "author_id" equals the
    author's id and stores the result as "coauthor_times". Writes in batches
    of 10000.
    :return:
    '''
    start_time = time()
    print(start_time)
    col1 = connectTable("qiuzh", "mag_authors0411")
    operation = []
    count = 0
    cursor = col1.find(no_cursor_timeout=True)
    for i in cursor:
        count += 1
        author_id = i["_id"]
        # Collection.count() was removed in PyMongo 4; count_documents is
        # the supported replacement.
        coauthor_times = col1.count_documents({"author_id": author_id})
        operation.append(
            pymongo.UpdateOne({"_id": author_id},
                              {"$set": {"coauthor_times": coauthor_times}}))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col1.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col1.bulk_write(operation, ordered=False)
    cursor.close()
    print(time(), (time() - start_time))
def boot_strap(P_d):
    """Bootstrap surprisal for every researcher in the training set.

    Draws 20 binomial samples of discovery counts given each researcher's
    coauthor count ("new_con") and probability P_d, converts each sample's
    upper-tail probability into a surprisal -log(P0), and stores the mean of
    the 20 surprisals as "bsur". Writes in batches of 10000, then prints
    sentinel counts for "sur", "dn" and "bsur".

    :param P_d: per-collaboration discovery probability
    """
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    cursor = col_author.find(no_cursor_timeout=True)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        n_coauthors = author["new_con"]
        samples = np.random.binomial(n_coauthors, P_d, 20)
        # Surprisal of observing at least `di` discoveries out of n_coauthors.
        surprisals = [-math.log(stats.binom.sf(di - 1, n_coauthors, P_d))
                      for di in samples]
        operation.append(
            pymongo.UpdateOne({"_id": author["_id"]},
                              {"$set": {"bsur": np.mean(surprisals)}}))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
        print("又写入并完成", len(operation))
    cursor.close()
    print(col_author.count_documents({"sur": -6}))
    print(col_author.count_documents({"dn": -1}))
    print(col_author.count_documents({"bsur": -6}))
def find_discoverer(maxbsur):
    """Label each researcher in the training set with "ifdis": 0 when the
    surprisal "sur" lies in [0, maxbsur), 1 otherwise (discoverer).
    Writes in batches of 10000.

    :param maxbsur: exclusive upper bound of the non-discoverer range
    """
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    cursor = col_author.find(no_cursor_timeout=True)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        # Chained comparison replaces `sur >= 0 and sur < maxbsur`.
        flag = 0 if 0 <= author["sur"] < maxbsur else 1
        operation.append(
            pymongo.UpdateOne({"_id": author["_id"]},
                              {"$set": {"ifdis": flag}}))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
        print("又写入并完成", len(operation))
    cursor.close()
def initialize_surprisal():
    """Reset "sur" and "bsur" to the sentinel value -6 for every researcher
    in the training set, batching writes every 10000 documents, then print
    the processed count and the number of documents still at dn == -1."""
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    cursor = col_author.find(no_cursor_timeout=True)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        operation.append(
            pymongo.UpdateOne({"_id": author["_id"]},
                              {"$set": {"sur": -6, "bsur": -6}}))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
    if operation:
        col_author.bulk_write(operation, ordered=False)
    print("finished")
    cursor.close()
    print(count)
    # Cursor.count() was removed in PyMongo 4; count_documents replaces
    # the original find(...).count() chain.
    print(col_author.count_documents({"dn": -1}))
def initialize_discover_number():
    '''
    This function was used on 2021.8.12 on mag_researchers0810; on 2021.9.1
    it was used on researchers0810_trainingset.
    Resets "dn" to the sentinel value -1 for every researcher, batching
    writes every 10000 documents.
    :return:
    '''
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    cursor = col_author.find(no_cursor_timeout=True)
    count = 0
    operation = []
    for author in cursor:
        count += 1
        operation.append(
            pymongo.UpdateOne({"_id": author["_id"]},
                              {"$set": {"dn": -1}}))
        if count % 10000 == 0:
            print("已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print("已写入:", count / 10000, flush=True)
            operation = []
    if operation:
        col_author.bulk_write(operation, ordered=False)
    print("finished")
    cursor.close()
    print(count)
    # Cursor.count() was removed in PyMongo 4; count_documents replaces
    # the original find(...).count() chain.
    print(col_author.count_documents({"dn": -1}))
def new_pub_count(begin, end, msg):
    """Recompute "pub_count" for researchers in the [begin:end] slice of the
    training set, counting only papers published in or before 1996.
    Writes in batches of 10000.

    :param begin: start index of the cursor slice
    :param end: end index of the cursor slice
    :param msg: thread/process label used in progress output
    """
    col_author = connectTable("qiuzh", "researchers0810_trainingset")
    count = 0
    operation = []
    cursor = col_author.find(no_cursor_timeout=True)[begin:end]
    for author in cursor:
        count += 1
        # Generator-sum replaces the explicit counting loop.
        n_early = sum(1 for paper in author["new_pubs"]
                      if paper["year"] <= 1996)
        operation.append(
            pymongo.UpdateOne({"_id": author["_id"]},
                              {"$set": {"pub_count": n_early}}))
        if count % 10000 == 0:
            print(msg, "已处理:", count / 10000, flush=True)
            col_author.bulk_write(operation, ordered=False)
            print(msg, "已写入:", count / 10000, flush=True)
            operation = []
            print(time(), flush=True)
    if operation:
        col_author.bulk_write(operation, ordered=False)
    cursor.close()
def author_first_year_distribution():
    '''
    Print every distinct "first_year" value present in mag_authors0510.
    :return:
    '''
    col = connectTable("qiuzh", "mag_authors0510")
    print(col.distinct("first_year"))
def delete_coauthor_counts():
    """Remove the "coauthor_counts" and "coauthor" fields from every document
    in mag_authors0411 that still carries them, using a single update_many.

    The commented-out per-document loop this function superseded has been
    deleted (dead code).
    """
    col = connectTable('qiuzh', "mag_authors0411")
    col.update_many({"coauthor_counts": {"$exists": True}},
                    {"$unset": {"coauthor_counts": 1, "coauthor": 1}})
    print("yes okay")
def author_pubs_number():
    """Store the length of each author's "new_pubs" list as "pub_count" on
    every document in mag_authors0510."""
    mycol = connectTable("qiuzh", "mag_authors0510")
    for doc in mycol.find():
        mycol.update_one({"_id": doc["_id"]},
                         {"$set": {"pub_count": len(doc["new_pubs"])}})