def getlists():
    """Collect the outgoing links of every "List_of" article and save them as JSON.

    Reads ``article_ids.json`` (page id -> article title) and
    ``pagelinks.csv`` (``|``-delimited rows of ``from_id`` and ``to_title``)
    from ``DUMPP``. For every row whose source page resolves to an article
    with "List_of" in its title, the target title is recorded if it is a key
    of the article dictionary. The resulting mapping
    ``{list_title: [linked_title, ...]}`` is written to
    ``DATAP + "/listlinks.json"``.
    """
    ad = load_articledict()
    # A set keeps the per-row membership test O(1); the link dump is scanned
    # row by row, so a list lookup here would dominate the run time.
    lists = {a for a in ad if "List_of" in a}

    with open(DUMPP + "/article_ids.json", "r") as ids_file:
        ids = json.load(ids_file)

    linkdict = {}
    with open(DUMPP + "/pagelinks.csv", "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter='|', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        for x, row in enumerate(reader, start=1):
            # Progress indicator for long dumps.
            if x % 100000 == 0:
                print(x)
            # Blank or truncated rows carry no usable link; skip them.
            if len(row) < 2:
                continue
            from_id = row[0]
            to_title = row[1]
            if from_id not in ids:
                continue
            from_title = ids[from_id]
            if from_title not in lists:
                continue
            # The entry is created even when no target qualifies, so every
            # list article seen in the dump appears in the output.
            targets = linkdict.setdefault(from_title, [])
            if to_title in ad:
                targets.append(to_title)

    with open(DATAP + "/listlinks.json", "w", encoding="utf-8") as f:
        json.dump(linkdict, f)
def plot_negative_seed_depth(ton):
    """Plot, per root, how many negative-seed articles sit at each depth.

    For every root in ``ROOTS`` and every depth ``d`` in ``range(ton)``, counts
    the articles whose ``<root>Depth`` entry equals ``d`` and whose
    ``"negativeSeed"`` entry equals 1. The counts are printed as a LaTeX table
    and shown as a grouped bar chart with each bar annotated with its height.

    :param ton: exclusive upper bound of the depths to tabulate.
    """
    langdict = load_articledict()
    fig, ax = plt.subplots(nrows=1, ncols=1)

    def count_at(root, depth):
        # "negativeSeed" is only read for entries that match the depth,
        # mirroring the short-circuit order of the original condition.
        key = root + "Depth"
        return len([cl for cl in langdict
                    if (key in langdict[cl] and langdict[cl][key] == depth)
                    and langdict[cl]["negativeSeed"] == 1])

    depthlist = [[count_at(c, d) for d in range(ton)] for c in ROOTS]

    # One CSV row per depth: "<depth>, <count root 0>, <count root 1>, ..."
    csvtext = "".join(
        ", ".join([str(n)] + [str(counts[n]) for counts in depthlist]) + "\n"
        for n in range(ton))

    dtypes = {name: int for name in ["depth"] + ROOTS}
    df = read_csv(StringIO(csvtext), delimiter=',', names=["depth"] + ROOTS,
                  dtype=dtypes)

    # to_latex must be called; printing the bound method only shows its repr.
    print(df.to_latex())

    df.plot(x="depth", y=ROOTS, kind="bar", ax=ax, logy=False, width=0.8,
            color=["red", "green", "blue"])
    ax.set_title('Negative Seed Distribution')
    for p in ax.patches:
        ax.annotate(str(p.get_height()),
                    (p.get_x() * 1.005, p.get_height() * 1.005))
    plt.show()
def solo(self, backup=True):
    """Run this object's ``check`` on the stored article dictionary and save the result.

    :param backup: when truthy, ``backup_articledict`` is called with the
        loaded dictionary before the check runs.
    """
    loaded = load_articledict()
    if backup:
        backup_articledict(loaded)

    # Only the check itself is timed; loading and saving are excluded.
    timer = start_time()
    checked = self.check(loaded)
    stop_time(timer)

    save_articledict(checked)
def solo(self):
    """Run this object's ``check`` on the stored category dictionary in place.

    Loads the article dictionary and ``catdict.json`` from ``DATAP``, passes
    both to ``self.check`` and writes the returned category dictionary back
    to ``catdict.json``.
    """
    articles = load_articledict()
    catdict_path = DATAP + '/catdict.json'

    with open(catdict_path, 'r', encoding="UTF8") as source:
        categories = load(source)

    categories = self.check(categories, articles)

    with open(catdict_path, 'w', encoding="UTF8") as target:
        dump(obj=categories, fp=target, indent=2)
        target.flush()
def plot_seed_depth(ton):
    """Tabulate and plot the number of seed articles per depth for each root.

    For every root in ``ROOTS`` and every depth in ``range(ton)``, counts the
    articles whose ``<root>Depth`` entry equals that depth and whose ``"Seed"``
    entry equals 1. Prints the transposed table as LaTeX and shows a
    log-scaled grouped bar chart.

    :param ton: exclusive upper bound of the depths to tabulate.
    """
    articles = load_articledict()

    def seeds_at(root, depth):
        # Count matching articles; "Seed" is consulted only after the depth
        # test has passed.
        depth_key = root + "Depth"
        total = 0
        for title in articles:
            entry = articles[title]
            if (depth_key in entry and entry[depth_key] == depth) \
                    and entry["Seed"] == 1:
                total += 1
        return total

    per_root = [[seeds_at(root, depth) for depth in range(ton)]
                for root in ROOTS]

    # Serialise as CSV text: one line per depth, one column per root.
    rows = []
    for depth in range(ton):
        cells = [str(depth)]
        for counts in per_root:
            cells.append(str(counts[depth]))
        rows.append(", ".join(cells) + "\n")
    csvtext = "".join(rows)

    columns = ["depth"] + ROOTS
    column_types = {name: int for name in columns}
    df = read_csv(StringIO(csvtext), delimiter=',', names=columns,
                  header=None, dtype=column_types, index_col=0)
    print(df.T.to_latex())

    # Large fonts for the figure.
    plt.rc('xtick', labelsize=30)
    plt.rc('ytick', labelsize=30)
    plt.rc('font', family='normal', weight='bold', size=42)

    ax = df.plot(y=ROOTS, kind="bar", logy=True, width=0.9,
                 color=["red", "green", "blue"])
    ax.set_title('Seed articles per depth.')
    # Per-bar height annotations are disabled in this variant.
    plt.show()
def plot_negative_seed_depth(ton):
    """Plot, per root, how many negative-seed articles sit at each depth.

    For every root in ``ROOTS`` and every depth ``d`` in ``range(ton)``, counts
    the articles whose ``<root>Depth`` entry equals ``d`` and whose
    ``"negativeSeed"`` entry equals 1. The counts are printed as a LaTeX table
    and shown as a grouped bar chart with each bar annotated with its height.

    :param ton: exclusive upper bound of the depths to tabulate.
    """
    langdict = load_articledict()
    fig, ax = plt.subplots(nrows=1, ncols=1)

    depthlist = []
    for c in ROOTS:
        depth_key = c + "Depth"
        counts = []
        for d in range(ton):
            # "negativeSeed" is only read for entries that match the depth,
            # mirroring the short-circuit order of the original condition.
            matching = [cl for cl in langdict
                        if (depth_key in langdict[cl]
                            and langdict[cl][depth_key] == d)
                        and langdict[cl]["negativeSeed"] == 1]
            counts.append(len(matching))
        depthlist.append(counts)

    # One CSV row per depth: "<depth>, <count root 0>, <count root 1>, ..."
    csv_rows = []
    for n in range(ton):
        fields = [str(n)] + [str(counts[n]) for counts in depthlist]
        csv_rows.append(", ".join(fields) + "\n")
    csvtext = "".join(csv_rows)

    dtypes = {"depth": int}
    for c in ROOTS:
        dtypes[c] = int
    df = read_csv(StringIO(csvtext), delimiter=',', names=["depth"] + ROOTS,
                  dtype=dtypes)

    # to_latex must be called; printing the bound method only shows its repr.
    print(df.to_latex())

    df.plot(x="depth", y=ROOTS, kind="bar", ax=ax, logy=False, width=0.8,
            color=["red", "green", "blue"])
    ax.set_title('Negative Seed Distribution')
    for p in ax.patches:
        ax.annotate(str(p.get_height()),
                    (p.get_x() * 1.005, p.get_height() * 1.005))
    plt.show()
def plot_seed_depth(ton):
    """Show how many seed articles exist at each depth below every root.

    An article counts for root ``r`` at depth ``d`` when its ``r + "Depth"``
    entry equals ``d`` and its ``"Seed"`` entry equals 1. Depths
    ``0 .. ton - 1`` are tabulated, the transposed table is printed as LaTeX,
    and a log-scaled grouped bar chart is displayed.

    :param ton: exclusive upper bound of the depths to tabulate.
    """
    langdict = load_articledict()

    counts_by_root = []
    for root in ROOTS:
        depth_key = root + "Depth"
        row = []
        for depth in range(ton):
            # The "Seed" lookup happens only once the depth test has passed.
            row.append(sum(
                1 for name in langdict
                if (depth_key in langdict[name]
                    and langdict[name][depth_key] == depth)
                and langdict[name]["Seed"] == 1))
        counts_by_root.append(row)

    # Build the CSV text line by line: depth first, then one count per root.
    pieces = []
    for depth in range(ton):
        pieces.append(str(depth))
        for idx, _root in enumerate(ROOTS):
            pieces.append(", " + str(counts_by_root[idx][depth]))
        pieces.append("\n")
    csvtext = "".join(pieces)

    header = ["depth"] + ROOTS
    dtypes = dict.fromkeys(header, int)
    df = read_csv(StringIO(csvtext), delimiter=',', names=header,
                  header=None, dtype=dtypes, index_col=0)
    print(df.T.to_latex())

    # Enlarge tick labels and the default font before plotting.
    for axis_name in ('xtick', 'ytick'):
        plt.rc(axis_name, labelsize=30)
    font = {'family': 'normal', 'weight': 'bold', 'size': 42}
    plt.rc('font', **font)

    ax = df.plot(y=ROOTS, kind="bar", logy=True, width=0.9,
                 color=["red", "green", "blue"])
    ax.set_title('Seed articles per depth.')
    # Bar-height annotations are intentionally left out here.
    plt.show()
return [l for l in seed_links if not bad_link(l)] def resolve_redirects(seed_links): resolved_tuples = [resolve_redirect(l) for l in seed_links] return [t[1] for t in resolved_tuples] def resolve_redirect(title): target = get_redirect(title) return title, target if __name__ == "__main__": # mine_summary_links() # seed_links = json.load(open(DATAP + "/seed_summary_links.json", "r")) # # seed_links = { # s[0]: [l for l in s[1] if not bad_link(l)] for s in seed_links.items() # } # # json.dump(seed_links, open(DATAP + "/seed_summary_links.json", "w+"), indent=4) ad = load_articledict() seed_links = json.load(open(DATAP + "/seed_summary_links.json", "r")) for s in seed_links: ad[s]["InternalWikiLinks"] = seed_links[s] save_articledict(ad)
from data import load_articledict, DATAP
import webbrowser

# Interactive evaluation script: opens the Wikipedia page of every article
# whose "Eval" entry is truthy, asks the operator for a verdict ('1' or '2')
# and records "<title>;<verdict>" in DATAP/temp/deleted_test.txt.
ad = load_articledict()
titles = [a for a in ad if ad[a]["Eval"]]
print(len(titles))

# The context manager closes the file even if the session is interrupted.
with open(DATAP + "/temp/deleted_test.txt", "w", encoding="utf-8") as f:
    for x, title in enumerate(titles):
        webbrowser.open("https://en.wikipedia.org/wiki/" + title, new=2)
        print(title)
        agreement = ""
        # Re-prompt until one of the two accepted answers is entered.
        while agreement not in ["1", "2"]:
            agreement = input(str(x) + " Enter '1' or '2'! '2' for not recognized as deleted.")
        # One record per line; without the terminator consecutive records
        # run together and cannot be split apart reliably.
        f.write(title + ";" + agreement + "\n")
        # Flush after every answer so finished work survives an abort.
        f.flush()