Example #1
def process_dupes():
    items = {}
    qids = []
    dupes = []

    # First dedupe by QID
    for item in Knead("data/reliwiki/query.csv").data():
        if item["item"] not in qids:
            reliwiki = item["reliwiki"]

            if reliwiki not in items:
                items[reliwiki] = [item]
            else:
                items[reliwiki].append(item)

            qids.append(item["item"])

    for values in items.values():
        if len(values) > 1:
            dupes = dupes + values

    Knead(dupes).write("data/reliwiki/dupes.csv",
                       fieldnames=[
                           "item", "reliwiki", "itemLabel", "itemDescription",
                           "instanceLabel", "instance"
                       ])
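A minimal standalone sketch of the same group-then-filter idea, using collections.defaultdict instead of the manual membership checks (the field names mirror the ones above; the sample rows are made up):

from collections import defaultdict

def find_duplicates(rows, group_key="reliwiki", id_key="item"):
    # Keep only the first row per QID, then group the rows by Reliwiki ID
    seen_qids = set()
    groups = defaultdict(list)

    for row in rows:
        if row[id_key] in seen_qids:
            continue
        seen_qids.add(row[id_key])
        groups[row[group_key]].append(row)

    # Every group with more than one row is a set of duplicates
    return [row for group in groups.values() if len(group) > 1 for row in group]

rows = [
    {"item": "Q1", "reliwiki": "123"},
    {"item": "Q2", "reliwiki": "123"},
    {"item": "Q3", "reliwiki": "456"},
]
print(find_duplicates(rows))  # the two rows that share Reliwiki ID 123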
Example #2
def find_titles():
    SPLIT_DASH = re.compile(" –|- ")

    def get_title(pageid):
        html_path = f"data/reliwiki/html/{pageid}.html"
        # print(f"Extracing title from {html_path}")

        with open(html_path) as f:
            soup = BeautifulSoup(f.read(), "lxml")

        title = soup.select_one("title").get_text().strip()
        parts = SPLIT_DASH.split(title)[1:]
        title = "- ".join(parts).replace("- Reliwiki", "").strip()
        return title

    titles = []
    for item in Knead("data/reliwiki/churches_gsheet.csv").data():
        if item["name"] != "":
            continue

        # Try to extract from the page title
        pageid = item["pageid"]
        try:
            title = get_title(pageid)
        except Exception as e:
            print(f"Could not fetch title because of {e}")
            continue

        print(f"Got '{title}'")

        titles.append({"pageid": pageid, "title": title})

    Knead(titles).write("data/reliwiki/church_extracted_titles.csv")
Example #3
def process_rmm():
    items = []
    qids = {
        i["rmm"]: i["item"]
        for i in Knead("data/reliwiki/rmm-all.csv").data()
    }

    for path in iter_html():
        with open(path["path"]) as f:
            matches = list(set(RMM_ID.findall(f.read())))

        if len(matches) == 0:
            continue

        for rmm in matches:
            if rmm not in qids:
                print(f"No QID for pageid {path['id']} (RMM {rmm})")
                continue

            items.append({
                "pageid": path["id"],
                "rmm": rmm,
                "qid": qids.get(rmm, None)
            })

    Knead(items).write("data/reliwiki/rmm.csv",
                       fieldnames=["pageid", "rmm", "qid"])
Example #4
def propvalue(claim):
    claim = Knead(claim)

    return {
        "id": claim.query("mainsnak/datavalue/value/id").data(),
        "property": claim.query("mainsnak/property").data()
    }
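propvalue expects a single claim in the usual Wikidata JSON shape; a minimal made-up claim shows what comes back (the property and item IDs are only placeholders):

claim = {
    "mainsnak": {
        "property": "P31",
        "datavalue": {"value": {"id": "Q16970"}}
    }
}
print(propvalue(claim))  # {'id': 'Q16970', 'property': 'P31'}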
Example #5
def parse_page(path):
    print(f"Parsing {path}")
    json_path = Path(path).with_suffix(".json")

    if json_path.exists():
        print(f"Got JSON file, returning that: {json_path}")
        return Knead(str(json_path)).data()

    with open(path) as f:
        soup = BeautifulSoup(f.read(), "lxml")

    # I guess the first table in mw-content-text is always the infobox
    table = soup.select_one("#mw-content-text table")

    if not table:
        print("Could not find table")
        return None

    infobox = {
        "coordinates": None,
        "pageid": Path(path).stem,
        "rijksmonument": []
    }

    for tr in table.select('tr[valign="top"]'):
        td = tr.select("td")

        if len(td) < 2:
            continue

        key = td[0].get_text().strip()
        val = td[1].get_text().strip()
        infobox[key] = val

    # Extract external references
    for a in table.select("a"):
        href = a.get("href")

        if href and RMM_ID.match(href):
            matches = RMM_ID.findall(href)
            infobox["rijksmonument"].append(matches[0])

    # Extract geocoordinates
    el_mapdata = soup.select_one("#map_leaflet_1 .mapdata")

    if el_mapdata:
        mapdata = json.loads(el_mapdata.get_text())

        if "locations" in mapdata and len(mapdata["locations"]) > 0:
            loc = mapdata["locations"][0]
            infobox["coordinates"] = f"{loc['lat']},{loc['lon']}"

    infobox["rijksmonument"] = ",".join(infobox["rijksmonument"])

    Knead(infobox).write(json_path)
    print(f"Written {json_path}")

    return infobox
Example #6
    def save(self):
        if Path(self.out_file).suffix == ".csv":
            Knead(self.results).write(
                self.out_file,
                fieldnames=self.reconciler.FIELDNAMES
            )
        else:
            Knead(self.results).write(self.out_file)

        print(f"Written to '{self.out_file}'")
Example #7
def create_csv():
    items = []
    json = Knead("data/sonneveld/kerkenkaart.json").data()

    for feature in json["features"]:
        prop = feature["properties"]
        coord = feature["geometry"]["coordinates"]
        prop["lat"] = coord[1]
        prop["lon"] = coord[0]
        items.append(prop)

    Knead(items).write("data/sonneveld/kerkenkaart.csv")
Example #8
def main():
    items = Knead(PATH + "/data/uds/monuments-with-qids.csv").data()
    skiplist = Skiplist("projects/skiplists/uds.txt")

    for index, item in enumerate(items):
        print(item)
        qid = item["qid"]
        bag = item["bag_ok"]
        url = item["url"]
        print()
        print(f"#{index} / #{len(items)}")
        print(f"Handling {qid} / {bag} / {url}")

        if skiplist.has(qid):
            print(f"{qid} in skiplist, skipping")
            continue

        wd_item = WikidataItem(qid)
        claims = wd_item.get_claims()

        if Props.BAG_BUILDING in claims:
            print("This item already has a BAG building ID, skipping")
            continue

        wd_item.add_string_claim(
            Props.BAG_BUILDING,
            bag,
            references=[
                wd_item.get_item_claim(Props.STATED_IN, Items.UDS_DOC),
                wd_item.get_url_claim(Props.REF_URL, url),
                wd_item.get_item_claim(Props.LANGUAGE_WORK, Items.DUTCH)
            ])

        skiplist.add(qid)
Example #9
def parse_overviews():
    items = []

    for path in OVERVIEW_PATH.glob("*.html"):
        with open(path) as f:
            soup = BeautifulSoup(f.read(), "lxml")

        print(f"Parsing {path}")

        for row in soup.select("table.list tbody tr"):
            cells = row.select("td")
            idx = cells[0].select_one("a").get("href").replace(
                "detail.jsp?id=", "")

            items.append({
                "id": idx,
                "stat_name": cells[0].select_one("a").get_text(),
                "handelsnaam": cells[1].select_one("a").get_text(),
                "plaats": cells[2].select_one("a").get_text()
            })

    parsed_path = str(DATA_PATH / "overview.csv")

    Knead(items).write(parsed_path,
                       fieldnames=["id", "stat_name", "handelsnaam", "plaats"])
Example #10
def add_sites():
    PATH = str(Path(__file__).parent)
    sites = Knead(PATH + "/data/zomergasten/guest-sites.csv").data()

    for site in sites:
        qid = site["qid"]
        url = site["url"]
        name = site["guest"]

        print()
        print(f"Now handling {qid} / {name}")

        item = WikidataItem(qid)
        claims = item.get_claims()

        if Props.OFFICIAL_WEBSITE in claims:
            print("Already got a site, skip")
            continue

        item.add_url_claim(
            Props.OFFICIAL_WEBSITE,
            url,
            qualifiers=[item.get_item_claim(Props.LANGUAGE_WORK, Items.DUTCH)],
            references=[
                item.get_claim(Props.RETRIEVED, wbtime_now()),
                item.get_url_claim(
                    Props.REF_URL,
                    "https://www.vpro.nl/programmas/zomergasten/a-z.html"),
                item.get_item_claim(Props.LANGUAGE_WORK, Items.DUTCH)
            ])
Example #11
    def __init__(self,
                 botid,
                 datapath,
                 key="id",
                 required_fields=None,
                 empty_check=lambda x: x is None):
        print(f"Setting up new bot '{botid}'")
        print(f"Data path: {datapath}")

        # Parse command line arguments and play it safe, assume
        # run_once and dry_run by default, except when they're
        # disabled
        args = pywikibot.handle_args()
        run_once = "-run-all" not in args
        dry_run = "-run-live" not in args
        print(f"Running once? {run_once}")
        print(f"Dry run? {dry_run}")

        self.id = botid
        self.run_once = run_once
        self.dry_run = dry_run
        self.skiplist = Skiplist(f"projects/skiplists/{self.id}.txt")
        self.key = key
        self.current_job = None
        self.data = Knead(datapath).data()
        self.required_fields = required_fields or []
        self.empty_check = empty_check
Example #12
def dataknead_newlines():
    Knead("input/entity.json")\
        .query("entities/Q184843/sitelinks")\
        .values()\
        .map("title")\
        .filter(lambda t: t != "Blade Runner")\
        .write("output/sitelinks-other-title.csv", fieldnames=["title"])
Example #13
def parse_pages(html_path):
    items = []

    for path in Path(html_path).glob("*.html"):
        print()
        print(f"Scraping {path}")

        with open(path) as f:
            soup = BeautifulSoup(f.read(), "lxml")

        url = soup.select_one('[rel="canonical"]').get("href")
        description = soup.select_one('.commons-file-information-table div.description')
        description = description.get_text().replace("Nederlands: ", "").strip()
        geolink = soup.select_one('[href*="wikimap.toolforge.org"]').get("href")
        geolink = parse_urlargs(geolink)

        items.append({
            "url": url,
            "name": None,
            "image": url.replace("https://commons.wikimedia.org/wiki/", ""),
            "inscription": description,
            "lat": geolink["lat"],
            "lon": geolink["lon"],
            "location": None,
            "location_qid": None,
            "inception": None,
            "street": None,
            "street_qid": None,
            "street_nr": None
        })

    filename = Path(html_path).stem + "-parsed.csv"
    out_path = str(Path(html_path).parent / filename)
    Knead(items).write(out_path)
Example #14
def parse():
    data = []

    for path in Path(f"{BASE}/html/").glob("*.html"):
        print(f"Parsing {path}")
        kid = path.stem

        with open(path) as f:
            soup = BeautifulSoup(f, "lxml")

        year_el = soup.select_one('a[href^="jaartal"]')

        if not year_el:
            year = None
        else:
            year = year_el.get_text()
            print(year)

        # Get ownership
        owner = None

        for label in soup.select(".lbl"):
            if label.get_text() == "eigendom van:":
                owner = label.parent.select_one(".val").get_text()

        data.append({"id": kid, "year": year, "owner": owner})

    Knead(data).write(f"{BASE}/scraped-data.csv")
Example #15
    def __init__(self,
                 botid,
                 datapath=None,
                 sparql=None,
                 run_once=False,
                 qid_key="qid",
                 empty_check=lambda x: x is None or x == "",
                 precheck_data=lambda x: True):
        print(f"Setting up new bot '{botid}'")

        if (not datapath) and (not sparql):
            raise ValueError("No datapath and no sparql")

        # Parse command line arguments and play it safe: assume
        # run_once by default, except when it is explicitly disabled
        args = pywikibot.handle_args()
        run_once = "-run-all" not in args
        print(f"Running once? {run_once}")

        self.id = botid
        self.run_once = run_once
        self.qid_key = qid_key
        self.empty_check = empty_check
        self.precheck_data = precheck_data
        self.skiplist = Skiplist(f"projects/skiplists/{self.id}.txt")

        if datapath:
            self.data = Knead(datapath, has_header=True).data()
        elif sparql:
            query = Query(sparql)
            self.data = list(query.iter_results())
Example #16
def scrape():
    churches = []

    with open("data/churchseats/seats.html") as f:
        soup = BeautifulSoup(f.read(), "lxml")

    for table in soup.select(".wikitable"):
        for row in table.select("tr"):
            cells = row.select("td")

            if len(cells) < 4:
                continue

            name = cells[1]
            seats = cells[2]
            name_anchor = name.select_one("a")

            if not name_anchor:
                continue

            if not name_anchor.get("href").startswith("/wiki"):
                continue

            churches.append({
                "name": name_anchor.get("title"),
                "href": name_anchor.get("href"),
                "seats": get_number(seats.get_text()),
                "reference": get_reference(seats, soup)
            })

    Knead(churches).write("data/churchseats/seats.csv",
                          fieldnames=["name", "href", "seats", "reference"])
Example #17
    def run(self):
        if self.input_format == "emlxml":
            results = self.parse_xmls()

        if self.add_percentages:
            results = self.add_percentages(results)

        Knead(results).write(self.output_path, fieldnames=self.fields)
Example #18
def main():
    results = []

    for path in Path(".").glob("download_data/*.xml"):
        data = load_xml(path)
        records = data["OAI-PMH"]["ListRecords"]["record"]
        records = [parse(r) for r in records]
        results = results + records

    chunks = [
        results[i:i + BATCH_SIZE] for i in range(0, len(results), BATCH_SIZE)
    ]

    for index, chunk in enumerate(chunks):
        Knead(chunk).write(f"results-{str(index).zfill(5)}.csv")

    Knead(results).write("results.json")
Example #19
def parse_all_items():
    items = []

    for path in (DATA_PATH / "html").glob("item-*.html"):
        with open(path) as f:
            item = parse_item(f.read())
            items.append(item)

    Knead(items).write(DATA_PATH / "items.json", indent=4)
Example #20
def scrape_pages():
    churches = []
    api = AllPages(API_ENDPOINT)

    for page in api.iterate_pages():
        print(page["title"])
        churches.append(page)

    Knead(churches).write("data/reliwiki/pages.json")
Example #21
def parse_all_overviews():
    items = []

    for path in (DATA_PATH / "html").glob("*.html"):
        with open(path) as f:
            items = items + parse_overview(f.read())

    Knead(items).write(DATA_PATH / "overview.csv",
                       fieldnames=["artist", "title", "href"])
Example #22
def process_pages():
    churches = []
    for path in Path("data/reliwiki/html/").glob("*.html"):
        data = parse_page(path)

        if data:
            churches.append(data)

    Knead(churches).write("data/reliwiki/churches_data.json")
Example #23
def get_pages_by_csv(csv_path):
    stem = Path(csv_path).stem
    html_path = BASE_PATH / stem
    print("Saving to: " + str(html_path))

    html_path.mkdir(exist_ok=True)

    for item in Knead(csv_path).data():
        get_page_by_id(item["pid"], html_path)
Example #24
class Datasheet:
    def __init__(self, path, index):
        self.path = path
        self.data = Knead(path, has_header=True).data()
        self.keys = {i[index]: i for i in self.data}

    def __getitem__(self, key):
        if key in self.keys:
            return self.keys[key]
        else:
            return None

    def append(self, row):
        self.data.append(row)
        self.save()

    def save(self):
        print("Saving")
        Knead(self.data).write(self.path)
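A hypothetical use of Datasheet (the CSV path and column names are made up for illustration):

artists = Datasheet("data/example/artists.csv", "label")

row = artists["Rembrandt"]  # None when the label is not in the sheet
if row:
    print(row["qid"])

artists.append({"label": "New artist", "qid": "Q123"})  # append() also saves the CSV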
Example #25
def transform():
    def parse(item):
        d = item["properties"]

        return {
            "title": d["KunstwerkN"],
            "creator": d["Kunstenaar"],
            "location": d["LokatieBee"],
            "lat": d["Breedtegra"].replace(",", "."),
            "lon": d["Lengtegraa"].replace(",", "."),
            "id": d["KunstwerkI"],
            "url": d["Websitever"]
        }

    k = Knead("data/kos-nijmegen/kos.json").apply(lambda f: f["features"]).map(
        parse)
    k.write(
        "data/kos-nijmegen/kos-parsed.csv",
        fieldnames=["id", "title", "creator", "location", "lat", "lon", "url"])
Example #26
    def _create_lookup_table(self):
        lookup = []
        json_path = str(Path(f"{self.data_path}/*.json"))
        logging.debug(f"Getting all data files from {json_path}")

        for path in glob(json_path):
            logging.debug(f"Parsing {path}")
            item = Knead(path).data()
            qid = item["id"]
            labels = self._get_all_labels(item)
            self.qid_count += 1

            for label in labels:
                lookup.append([label, qid])

        self.label_count = len(lookup)
        logging.debug(f"Found {self.label_count} labels")
        logging.debug(f"Writing lookup table to {self.lookup_path}")
        Knead(lookup).write(self.lookup_path)
Example #27
def parse_json_items():
    qid_artists = Datasheet(str(DATA_PATH / "qid-artists.csv"), "label")
    qid_collections = Datasheet(str(DATA_PATH / "qid-collections.csv"),
                                "label")
    items = []

    def get_qid(datasheet, key):
        if datasheet[key]:
            return datasheet[key]["qid"]
        else:
            return None

    for item in Knead(str(DATA_PATH / "items.json")).data():
        print(item.get("title", None))
        year = None

        if item["jaar"].isdigit():
            year = int(item["jaar"])

        collection_label = item.get("collectie", None)
        artist_label = item.get("artist", None)

        items.append({
            "inventory_nr": item["objectnummer"],
            "title": item.get("title", None),
            "year": year,
            "url": item["href"],
            "artist_label": artist_label,
            "artist_qid": get_qid(qid_artists, artist_label),
            "collection_label": collection_label,
            "collection_qid": get_qid(qid_collections, collection_label)
        })

    Knead(items).write(str(DATA_PATH / "items2.csv"))
Example #28
    def __init__(self, path, data_path, lookup_path, key=None):
        logger.debug(f"Importing {path} to {data_path}")
        self.data_path = data_path
        self.key = key
        self.label_count = 0
        self.lookup_path = lookup_path
        self.path = path
        self.qid_count = 0
        self.qids = Knead(path, has_header=self.key is not None) \
            .map(self._cleanup).data()
        logger.debug(f"Found {len(self.qids)} ids")
Example #29
    def _create_lookup(self):
        lookup = {}

        for row in Knead(self.lookup_path).data():
            label = row[0]
            qid = row[1]

            if label in lookup:
                lookup[label].append(qid)
            else:
                lookup[label] = [qid]

        return lookup
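The grouping in _create_lookup can also be expressed with dict.setdefault; a standalone sketch on made-up label/QID pairs:

rows = [["Westerkerk", "Q1"], ["Westerkerk", "Q2"], ["Noorderkerk", "Q3"]]

lookup = {}
for label, qid in rows:
    lookup.setdefault(label, []).append(qid)

print(lookup)  # {'Westerkerk': ['Q1', 'Q2'], 'Noorderkerk': ['Q3']}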
Example #30
def get_all():
    churches = []

    for index, page in enumerate(pages):
        pid = page["id"]
        qid = page.get("q", None)
        title = page["title"]
        print(f"#{index} '{title}'' ({pid} / {qid})")

        path = DATA_PATH / f"{pid}.json"
        ibox = Infobox("nl", title)

        if path.exists():
            print(f"{path} exists, using that")
            apidata = Knead(str(path), read_as="json").data()
            data = ibox.get_data(apidata)
        else:
            data = ibox.get_data()
            # Write cached data
            print(f"Writing cache data to {path}")
            Knead(ibox.apidata).write(path.resolve())

        boxes = data["infoboxes"]

        if len(boxes) == 0:
            print("No infobox")
            continue

        if len(boxes) > 1:
            print("More than one infobox, picking the first")

        churches.append({
            "pid": pid,
            "qid": qid,
            "title": title,
            "data": boxes[0].box
        })

    Knead(churches).write("parsed.json")
Example #31
    def _get_by_namespace(self, tree, namespace):
        items = Knead(tree).filter(lambda i: i["@id"].startswith(namespace.uri))
        return items.data()
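The filter above keeps the entries whose @id starts with the namespace URI; a quick plain-Python illustration on a made-up JSON-LD style tree (SimpleNamespace stands in for the namespace object, and the URIs are placeholders):

from types import SimpleNamespace

tree = [
    {"@id": "http://schema.org/name"},
    {"@id": "http://example.org/other"},
]
ns = SimpleNamespace(uri="http://schema.org/")

items = [i for i in tree if i["@id"].startswith(ns.uri)]
print(items)  # [{'@id': 'http://schema.org/name'}]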