Exemplo n.º 1
0
def getlists():
    """Collect the outgoing links of every "List_of" article and save them.

    Reads ``DUMPP/article_ids.json`` (used as a mapping from page id to
    title) and streams ``DUMPP/pagelinks.csv``, a ``|``-delimited file whose
    first two fields are read as the source page id and the target title.
    For each row whose source title contains "List_of" and is a key of the
    article dict, the target title is recorded if it is also a key of the
    article dict. The mapping ``{list_title: [target_title, ...]}`` is
    written to ``DATAP/listlinks.json``.
    """
    ad = load_articledict()
    # A set keeps the per-row membership test O(1); the pagelinks file is
    # scanned row by row, so a list scan here would dominate the runtime.
    # NOTE(review): assumes the titles stored in article_ids.json are strings.
    lists = {a for a in ad if "List_of" in a}

    # Close the id file deterministically instead of leaking the handle.
    with open(DUMPP + "/article_ids.json", "r") as f:
        ids = json.load(f)

    linkdict = {}
    with open(DUMPP + "/pagelinks.csv", "r", encoding="utf-8") as f:
        reader = csv.reader(f, delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        for x, row in enumerate(reader, start=1):
            # Progress indicator for long runs.
            if x % 100000 == 0:
                print(x)
            from_id = row[0]
            to_title = row[1]
            if from_id not in ids:
                continue
            from_title = ids[from_id]
            if from_title not in lists:
                continue
            # Register the list page even when none of its links qualify.
            targets = linkdict.setdefault(from_title, [])
            if to_title in ad:
                targets.append(to_title)

    with open(DATAP + "/listlinks.json", "w", encoding="utf-8") as f:
        json.dump(linkdict, f)
Exemplo n.º 2
0
def plot_negative_seed_depth(ton):
    """Bar-plot the number of negative-seed articles per depth for each root.

    For every root in ``ROOTS`` and every depth ``d`` in ``range(ton)``,
    counts the article-dict entries that have a ``"<root>Depth"`` value equal
    to ``d`` and ``"negativeSeed" == 1``. The counts are printed as a LaTeX
    table and shown as a grouped bar chart with each bar labelled by its
    height.

    :param ton: exclusive upper bound of the depths to report.
    """
    langdict = load_articledict()
    _, ax = plt.subplots(nrows=1, ncols=1)

    depthlist = []
    for c in ROOTS:
        depth_key = c + "Depth"
        # "negativeSeed" is only looked up for entries that already match the
        # depth, so entries without a depth for this root never need that key.
        depthlist.append([
            sum(1 for cl in langdict
                if depth_key in langdict[cl]
                and langdict[cl][depth_key] == d
                and langdict[cl]["negativeSeed"] == 1)
            for d in range(ton)
        ])

    # One CSV line per depth: the depth followed by one count per root.
    csvtext = "".join(
        str(n) + "".join(", " + str(counts[n]) for counts in depthlist) + "\n"
        for n in range(ton)
    )

    dtypes = {"depth": int}
    for c in ROOTS:
        dtypes[c] = int

    df = read_csv(StringIO(csvtext), delimiter=',', names=["depth"] + ROOTS,
                  dtype=dtypes)
    # Call to_latex(); printing the bare attribute only shows the bound
    # method's repr, not the table.
    print(df.to_latex())
    df.plot(x="depth", y=ROOTS, kind="bar", ax=ax, logy=False, width=0.8, color=["red", "green", "blue"])

    ax.set_title('Negative Seed Distribution')
    # Label every bar with its height, nudged slightly off the bar corner.
    for p in ax.patches:
        ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
    plt.show()
Exemplo n.º 3
0
 def solo(self, backup=True):
     """Run this check stand-alone against the stored article dict.

     Loads the article dict, optionally hands it to ``backup_articledict``,
     runs ``self.check`` on it between ``start_time``/``stop_time`` calls,
     and saves whatever ``self.check`` returns.

     :param backup: when truthy, back up the loaded dict before checking.
     """
     loaded = load_articledict()
     if backup:
         backup_articledict(loaded)

     timer = start_time()
     checked = self.check(loaded)
     stop_time(timer)

     save_articledict(checked)
Exemplo n.º 4
0
 def solo(self):
     """Run this check stand-alone against the stored category dict.

     Loads the article dict and ``DATAP/catdict.json``, passes both to
     ``self.check`` and writes the returned object back to the same file
     with an indent of 2.
     """
     articles = load_articledict()
     catdict_path = DATAP + '/catdict.json'

     with open(catdict_path, 'r', encoding="UTF8") as source:
         categories = load(source)
         # The check runs while the input file is still open.
         checked = self.check(categories, articles)

     with open(catdict_path, 'w', encoding="UTF8") as target:
         dump(obj=checked, fp=target, indent=2)
         target.flush()
Exemplo n.º 5
0
 def solo(self, backup=True):
     """Apply ``self.check`` to the persisted article dict and save the result.

     :param backup: when truthy, ``backup_articledict`` is called on the
         freshly loaded dict before the check runs.
     """
     current = load_articledict()
     if backup:
         backup_articledict(current)

     # Time only the check itself, not loading or saving.
     started = start_time()
     result = self.check(current)
     stop_time(started)

     save_articledict(result)
Exemplo n.º 6
0
 def solo(self):
     """Apply ``self.check`` to the persisted category dict and save the result.

     Reads ``DATAP/catdict.json``, calls ``self.check`` with its content and
     the loaded article dict, then overwrites the file with the value the
     check returns (indent of 2).
     """
     article_data = load_articledict()
     path = DATAP + '/catdict.json'

     with open(path, 'r', encoding="UTF8") as infile:
         updated = self.check(load(infile), article_data)

     with open(path, 'w', encoding="UTF8") as outfile:
         dump(obj=updated, fp=outfile, indent=2)
         outfile.flush()
Exemplo n.º 7
0
def plot_seed_depth(ton):
    """Bar-plot the number of seed articles per depth for each root.

    For every root in ``ROOTS`` and every depth in ``range(ton)``, counts the
    article-dict entries whose ``"<root>Depth"`` equals that depth and whose
    ``"Seed"`` value is 1. The transposed table is printed as LaTeX and the
    counts are shown as a log-scaled grouped bar chart.

    :param ton: exclusive upper bound of the depths to report.
    """
    articles = load_articledict()

    def seeds_at(root, depth):
        """Count seed entries recorded at ``depth`` below ``root``."""
        key = root + "Depth"
        matches = 0
        for title in articles:
            entry = articles[title]
            # "Seed" is only read once the depth matched.
            if key in entry and entry[key] == depth and entry["Seed"] == 1:
                matches += 1
        return matches

    per_root = [[seeds_at(root, depth) for depth in range(ton)]
                for root in ROOTS]

    # One CSV line per depth: the depth followed by one count per root.
    lines = []
    for depth in range(ton):
        cells = [str(depth)] + [str(counts[depth]) for counts in per_root]
        lines.append(", ".join(cells) + "\n")
    table_text = "".join(lines)

    columns = ["depth"] + ROOTS
    column_types = {name: int for name in columns}

    df = read_csv(StringIO(table_text),
                  delimiter=',',
                  names=columns,
                  header=None,
                  dtype=column_types,
                  index_col=0)
    print(df.T.to_latex())

    # Large fonts so the figure stays legible when scaled down.
    plt.rc('xtick', labelsize=30)
    plt.rc('ytick', labelsize=30)
    plt.rc('font', family='normal', weight='bold', size=42)

    ax = df.plot(y=ROOTS,
                 kind="bar",
                 logy=True,
                 width=0.9,
                 color=["red", "green", "blue"])
    ax.set_title('Seed articles per depth.')
    plt.show()
Exemplo n.º 8
0
def plot_negative_seed_depth(ton):
    """Bar-plot the number of negative-seed articles per depth for each root.

    For every root in ``ROOTS`` and every depth ``d`` in ``range(ton)``,
    counts the article-dict entries that have a ``"<root>Depth"`` value equal
    to ``d`` and ``"negativeSeed" == 1``. The counts are printed as a LaTeX
    table and shown as a grouped bar chart with each bar labelled by its
    height.

    :param ton: exclusive upper bound of the depths to report.
    """
    langdict = load_articledict()
    _, ax = plt.subplots(nrows=1, ncols=1)

    depthlist = []
    for c in ROOTS:
        depth_key = c + "Depth"
        # "negativeSeed" is only looked up for entries that already match the
        # depth, so entries without a depth for this root never need that key.
        depthlist.append([
            sum(1 for cl in langdict
                if depth_key in langdict[cl]
                and langdict[cl][depth_key] == d
                and langdict[cl]["negativeSeed"] == 1)
            for d in range(ton)
        ])

    # One CSV line per depth: the depth followed by one count per root.
    csvtext = "".join(
        str(n) + "".join(", " + str(counts[n]) for counts in depthlist) + "\n"
        for n in range(ton)
    )

    dtypes = {"depth": int}
    for c in ROOTS:
        dtypes[c] = int

    df = read_csv(StringIO(csvtext),
                  delimiter=',',
                  names=["depth"] + ROOTS,
                  dtype=dtypes)
    # Call to_latex(); printing the bare attribute only shows the bound
    # method's repr, not the table.
    print(df.to_latex())
    df.plot(x="depth",
            y=ROOTS,
            kind="bar",
            ax=ax,
            logy=False,
            width=0.8,
            color=["red", "green", "blue"])

    ax.set_title('Negative Seed Distribution')
    # Label every bar with its height, nudged slightly off the bar corner.
    for p in ax.patches:
        ax.annotate(str(p.get_height()),
                    (p.get_x() * 1.005, p.get_height() * 1.005))
    plt.show()
Exemplo n.º 9
0
def plot_seed_depth(ton):
    """Show how many seed articles each root has at every depth below ``ton``.

    An article-dict entry counts for root ``r`` at depth ``d`` when it has a
    ``r + "Depth"`` value equal to ``d`` and its ``"Seed"`` value is 1. The
    transposed count table is printed as LaTeX, then drawn as a log-scaled
    grouped bar chart.

    :param ton: exclusive upper bound of the depths to report.
    """
    entries = load_articledict()

    def count_seeds(depth_key, wanted_depth):
        """Count seed entries whose ``depth_key`` value equals ``wanted_depth``."""
        total = 0
        for name in entries:
            record = entries[name]
            if depth_key not in record:
                continue
            if record[depth_key] != wanted_depth:
                continue
            # Only entries at the wanted depth are asked for their seed flag.
            if record["Seed"] == 1:
                total += 1
        return total

    counts_by_root = []
    for root in ROOTS:
        counts_by_root.append(
            [count_seeds(root + "Depth", level) for level in range(ton)])

    # Serialise as "<depth>, <count root 1>, <count root 2>, ...\n" lines.
    csv_lines = [
        ", ".join([str(level)] + [str(col[level]) for col in counts_by_root]) + "\n"
        for level in range(ton)
    ]
    csv_source = StringIO("".join(csv_lines))

    header = ["depth"] + ROOTS
    int_columns = dict.fromkeys(header, int)

    df = read_csv(csv_source, delimiter=',', names=header, header=None,
                  dtype=int_columns, index_col=0)
    print(df.T.to_latex())

    # Oversized tick labels and font for presentation-quality output.
    for axis_ticks in ('xtick', 'ytick'):
        plt.rc(axis_ticks, labelsize=30)
    plt.rc('font', family='normal', weight='bold', size=42)

    ax = df.plot(y=ROOTS, kind="bar", logy=True, width=0.9,
                 color=["red", "green", "blue"])
    ax.set_title('Seed articles per depth.')
    plt.show()
Exemplo n.º 10
0
    return [l for l in seed_links if not bad_link(l)]


def resolve_redirects(seed_links):
    """Return the redirect target of every link in ``seed_links``, in order.

    Each link is passed to ``resolve_redirect``; only the second element of
    the returned pair is kept.
    """
    targets = []
    for link in seed_links:
        pair = resolve_redirect(link)
        targets.append(pair[1])
    return targets


def resolve_redirect(title):
    """Pair ``title`` with the value ``get_redirect`` returns for it.

    :return: a ``(title, target)`` tuple.
    """
    return title, get_redirect(title)


if __name__ == "__main__":
    # Earlier one-off steps, kept for reference: mine the summary links and
    # strip the bad ones from the stored file.
    #
    # mine_summary_links()
    # seed_links = json.load(open(DATAP + "/seed_summary_links.json", "r"))
    #
    # seed_links = {
    #     s[0]: [l for l in s[1] if not bad_link(l)] for s in seed_links.items()
    # }
    #
    # json.dump(seed_links, open(DATAP + "/seed_summary_links.json", "w+"), indent=4)

    # Copy the stored summary links of each seed into its article-dict entry.
    ad = load_articledict()
    # Use a context manager so the file handle is closed deterministically.
    with open(DATAP + "/seed_summary_links.json", "r") as f:
        seed_links = json.load(f)

    for s, links in seed_links.items():
        ad[s]["InternalWikiLinks"] = links

    save_articledict(ad)
Exemplo n.º 11
0
from data import load_articledict, DATAP
import webbrowser

# Manual evaluation helper: opens every article flagged with a truthy "Eval"
# value in the browser, asks the reviewer for a verdict ('1' or '2') and
# records one "title;verdict" line per article in DATAP/temp/deleted_test.txt.
ad = load_articledict()

titles = [a for a in ad if ad[a]["Eval"]]
print(len(titles))

# The context manager guarantees the answers collected so far are flushed and
# the file is closed even if the interactive session is interrupted.
with open(DATAP + "/temp/deleted_test.txt", "w", encoding="utf-8") as f:
    for x, title in enumerate(titles):
        webbrowser.open("https://en.wikipedia.org/wiki/" + title, new=2)
        print(title)
        agreement = ""
        # Re-prompt until a valid verdict is entered.
        while agreement not in ["1", "2"]:
            agreement = input(str(x) + " Enter '1' or '2'! '2' for not recognized as deleted.")
        # Terminate each record with a newline; without it all records run
        # together on one line and cannot be told apart.
        f.write(title + ";" + agreement + "\n")