Example #1
def _page_img_links(page_title, min_size, max_size):
    """Return all image links from a Wikipedia page.

    Args:
        page_title: A string containing the page title.
        min_size: An integer describing the minimum dimensions that an image
            must have (both height and width) in order to be returned.
        max_size: An integer giving the maximum dimension (the larger of
            height and width) an image may have and still be returned.

    Returns:
        A tuple of two lists: the image links and the corresponding maximum
        dimension (the larger of height and width) of each image.
    """
    import xml.etree.ElementTree as ET
    import wiki

    data = wiki.get_wiki_json(page_title)
    tree = ET.fromstring(data['text']['*'])
    req_keys = set(['alt', 'src', 'width', 'height'])

    img_links = []
    sizes = []
    for img in tree.findall(".//img"):
        attributes = set(img.attrib.keys())
        if len(attributes.intersection(req_keys)) == 4:
            width = int(img.attrib['width'])
            height = int(img.attrib['height'])
            size = max(height, width)
            if min_size <= size <= max_size:
                if img.attrib['src'][-3:] in ['jpg', 'png']:
                    img_links.append("https://" + img.attrib['src'][2:])
                    sizes.append(size)

    return img_links, sizes
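
A minimal usage sketch for the function above; the page title and size bounds are illustrative and assume the wiki helper module used in these examples is importable:

links, sizes = _page_img_links("Impressionism", min_size=100, max_size=600)
for url, size in zip(links, sizes):
    print(size, url)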
Example #2
def _wiki_page_revisions(page_title):
    """Return the full revision history of a Wikipedia page via the MediaWiki API."""
    import requests
    import wiki

    base_api_url = 'https://en.wikipedia.org/w/api.php?'

    page_json = wiki.get_wiki_json(page_title)
    pageid = page_json['pageid']
    revid = page_json['revid']

    api_query = base_api_url + \
        "action=query&format=json&" + \
        "prop=revisions&rvprop=ids|size|timestamp|comment|user&" + \
        "rvlimit=max&" + \
        "pageids={0:d}&".format(pageid) + \
        "rvstartid={0:d}&".format(revid)
    req = requests.get(api_query)
    page_data = req.json()

    rev_data = page_data['query']['pages'][str(pageid)]['revisions']

    while 'continue' in page_data:
        rvcontinue = page_data['continue']['rvcontinue']
        api_query_continue = api_query + \
            "rvcontinue={0:s}&".format(rvcontinue)
        req = requests.get(api_query_continue)
        page_data = req.json()
        rev_data += page_data['query']['pages'][str(pageid)]['revisions']
        msg = "Loaded {0:d} revisions, through {1:s}"
        print(msg.format(len(rev_data), rev_data[-1]['timestamp']))

    return rev_data
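
A hedged usage sketch; the page title is illustrative and the call needs network access to the MediaWiki API:

revisions = _wiki_page_revisions("Impressionism")
print("total revisions:", len(revisions))
print("latest edit:", revisions[0]['timestamp'], "by", revisions[0].get('user', '(hidden)'))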
Example #3
def link_to_geo(link):
    """Extract geolocation information from a Wikipedia page.

    Args:
        link: A string giving the link to a Wikipedia page.
    Returns:
        Either None, if no geolocation is found, or a tuple giving the
        (lat, lon) from the page's metadata.
    """
    import re
    import xml.etree.ElementTree as ET
    import wiki

    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    geo = tree.find(".//span[@class='geo']")
    if geo is None:
        return None

    result = re.split(';', geo.text)
    if len(result) != 2:
        return None

    try:
        lat = float(result[0])
        lon = float(result[1])
    except ValueError:
        return None

    return lat, lon
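
A short usage sketch, assuming the page carries a 'geo' span in its rendered HTML (the title is illustrative):

coords = link_to_geo("Richmond,_Virginia")
if coords is not None:
    lat, lon = coords
    print("latitude {0:.4f}, longitude {1:.4f}".format(lat, lon))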
Example #4
def link_to_section(link):
    """Extract internal Wikipedia links by section.

    Args:
        link: A string giving the link to a Wikipedia page.
    Returns:
        A list of dictionaries, one per section, each with a 'heading' and a
        'text' entry.
    """
    import xml.etree.ElementTree as ET
    import wiki
    # clean_text is a text-normalisation helper defined elsewhere in this module.

    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    output = []
    temp = []
    heading = "Header"
    for child in tree:
        if child.tag == "p":
            temp.append("".join(list(child.itertext())))
        elif child.tag == "h2":
            if temp:
                output.append(
                    dict(heading=heading, text=clean_text("".join(temp))))
            temp = []
            heading = child.find('.//span[@class="mw-headline"]')
            if heading is not None:
                heading = heading.text
            else:
                heading = ""

    if temp:
        output.append(dict(heading=heading, text=clean_text("".join(temp))))

    return output
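
A small usage sketch showing how the per-section output might be inspected (the page title is illustrative):

for section in link_to_section("Impressionism"):
    print("{0}: {1} characters".format(section['heading'], len(section['text'])))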
Example #5
def _compute_meta_dataframe(links):
    """Convert a list of Wikipedia page links into a pandas DataFrame of page metadata.
    """
    import re
    import xml.etree.ElementTree as ET

    import pandas as pd
    from wiki import get_wiki_json
    # _tree_to_doc and _tree_to_geo are helpers defined elsewhere in this module.

    meta = dict(link=[], title=[], doc=[], num_sections=[],
                num_images=[], num_ilinks=[], num_elinks=[],
                num_langs=[], langs=[], ilinks=[],
                lat=[], lon=[])
    for link in links:
        data = get_wiki_json(link)
        tree = ET.fromstring(data['text']['*'])

        meta['link'].append(re.sub(' ', '_', data['title']))
        meta['title'].append(re.sub('<[^>]+>', '', data['displaytitle']))
        meta['doc'].append(_tree_to_doc(tree))
        meta['num_sections'].append(len(data['sections']))
        meta['num_images'].append(len(data['images']))
        meta['num_ilinks'].append(len(data['links']))
        meta['num_elinks'].append(len(data['externallinks']))
        meta['num_langs'].append(len(data['langlinks']))
        meta['langs'].append([x['lang'] for x in data['langlinks']])
        meta['ilinks'].append([re.sub(' ', '_', x['*']) for x in
                               data['links'] if x['ns'] == 0])
        lat, lon = _tree_to_geo(tree)
        meta['lat'].append(lat)
        meta['lon'].append(lon)

    pdf = pd.DataFrame(meta).drop_duplicates(subset='link', keep="first")
    return pdf.reset_index()
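
A usage sketch, assuming the helper functions the code relies on (_tree_to_doc, _tree_to_geo) are available in the same module; the links are illustrative:

pdf = _compute_meta_dataframe(["Impressionism", "Claude_Monet"])
print(pdf[['link', 'num_sections', 'num_images', 'lat', 'lon']].head())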
Example #6
def _compute_meta_dataframe(links):
    """Convert a list of Wikipedia page links into a pandas DataFrame of page metadata.
    """
    import re
    import xml.etree.ElementTree as ET

    import pandas as pd
    from wiki import get_wiki_json

    meta = dict(link=[],
                title=[],
                doc=[],
                first_p=[],
                num_sections=[],
                num_images=[],
                num_ilinks=[],
                num_elinks=[],
                num_langs=[],
                langs=[],
                ilinks=[],
                first_img=[],
                lat=[],
                lon=[])
    for link in links:
        data = get_wiki_json(link)
        tree = ET.fromstring(data['text']['*'])

        meta['link'].append(re.sub(' ', '_', data['title']))
        meta['title'].append(re.sub('<[^>]+>', '', data['displaytitle']))
        next_doc, next_first_p = tree_to_doc(tree)
        meta['doc'].append(next_doc)
        meta['first_p'].append(next_first_p)
        meta['num_sections'].append(len(data['sections']))
        meta['num_images'].append(len(data['images']))
        meta['num_ilinks'].append(len(data['links']))
        meta['num_elinks'].append(len(data['externallinks']))
        meta['num_langs'].append(len(data['langlinks']))
        meta['langs'].append([(x['lang'], x['url'])
                              for x in data['langlinks']])
        meta['ilinks'].append(
            [re.sub(' ', '_', x['*']) for x in data['links'] if x['ns'] == 0])
        lat, lon = _tree_to_geo(tree)
        meta['lat'].append(lat)
        meta['lon'].append(lon)

        # add first image to the dataset
        first_img = ''
        for item in tree.findall('.//img'):
            if int(item.attrib.get('width', 0)) >= 150:
                first_img = 'https:' + item.attrib['src']
                break

        meta['first_img'].append(first_img)

    meta['eigen'] = _compute_centrality(links, meta)

    pdf = pd.DataFrame(meta)
    return pdf.reset_index()
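
This extended variant additionally records the first paragraph, the first reasonably large image, and a centrality score ('eigen'). A hedged sketch of how its output might be used (the links are illustrative):

pdf = _compute_meta_dataframe(["Impressionism", "Claude_Monet"])
print(pdf[['link', 'first_img', 'eigen']].head())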
Example #7
File: system.py  Project: sudodoki/prj-nlp
def get_data(labels=None):
    """Fetch the labelled train/test pages and cache them as JSON on disk."""
    init_dir(train_dir)
    init_dir(test_dir)
    if labels is None:
        with (data_dir / db_filename).open() as f:
            labels = json.load(f)
    data = []
    for key, value in _zip(TRAIN, "train") + _zip(TEST, "test"):
        output = get_wiki_json(WIKI_MAP[key])
        output["y_true"] = labels[key]
        output["type"] = value
        output["title"] = key
        data.append(output)
        with (data_dir / value / f"{key}.json").open("w+") as f:
            json.dump(output, f, indent=4)
    with (data_dir / 'train_test.json').open("w+") as f:
        json.dump(data, f)
    return data
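
A hedged usage sketch; it assumes the project's data directory, label file, and page mapping (WIKI_MAP, TRAIN, TEST) are configured as in the source repository:

data = get_data()
n_train = sum(1 for page in data if page["type"] == "train")
print(len(data), "pages cached,", n_train, "of them in the training split")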
Example #8
def link_to_p(link):
    """Return each paragraph of a Wikipedia page as a string.

    Args:
        link: A string giving the link to a Wikipedia page.
    Returns:
        A list of non-empty strings, one per paragraph.
    """
    import xml.etree.ElementTree as ET
    import wiki
    # clean_text is a text-normalisation helper defined elsewhere in this module.

    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    output = []
    for child in tree.findall('.//p'):
        text = "".join(list(child.itertext()))
        text = clean_text(text)
        if text:
            output.append(text)

    return output
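
A minimal usage sketch that prints the opening of the lead paragraph (the page title is illustrative):

paragraphs = link_to_p("Richmond,_Virginia")
if paragraphs:
    print(paragraphs[0][:200])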
Example #9
def link_to_lilinks(link):
    """Extract internal Wikipedia links from list items.

    Args:
        link: A string giving the link to a Wikipedia page.
    Returns:
        A sorted list of unique internal links found inside <li> elements
        that also appear in the page's link metadata.
    """
    import re
    import xml.etree.ElementTree as ET
    import wiki

    data = wiki.get_wiki_json(link)
    tree = ET.fromstring(data['text']['*'])

    output = []
    for ilink in tree.findall(".//li/a"):
        if 'href' in ilink.attrib:
            href = ilink.attrib['href']
            if href[:6] == "/wiki/":
                output.append(href[6:])

    ilinks = [re.sub(' ', '_', x) for x in wiki.links_as_list(data)]
    output = list(set(output).intersection(ilinks))
    return sorted(output)
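
A short usage sketch; the page title is illustrative:

for target in link_to_lilinks("List_of_American_novelists")[:5]:
    print(target)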
Example #10
def get_internal_links(data):
    """Extract internal Wikipedia links.

    Args:
        data: Either a string describing the name of a Wikipedia page or a
              dictionary object already pulled with `wiki.get_wiki_json`.
    Returns:
        A dictionary with three elements: 'ilinks' (all of the internal links
        for the page), 'ilinks_p' (links from the page found inside paragraph
        tags), and 'ilinks_li' (links found inside list items). All links are
        checked to make sure they actually exist.
    """
    import re
    import xml.etree.ElementTree as ET

    from wiki import get_wiki_json

    if isinstance(data, str):
        data = get_wiki_json(data)

    ilinks = [x['*'] for x in data['links'] if x['ns'] == 0 and 'exists' in x]
    ilinks = [re.sub(' ', '_', x) for x in ilinks]
    tree = ET.fromstring(data['text']['*'])

    output_p = []
    for child in tree:
        if child.tag == "p":
            for ilink in child.findall(".//a"):
                href = ilink.attrib.get('href', '')
                if href[:6] == "/wiki/":
                    output_p.append(href[6:])

    output_li = []
    for ilink in tree.findall(".//li/a"):
        if 'href' in ilink.attrib:
            href = ilink.attrib['href']
            if href[:6] == "/wiki/":
                output_li.append(href[6:])

    output_p = sorted(set(output_p).intersection(ilinks))
    output_li = sorted(set(output_li).intersection(ilinks))

    return dict(ilinks=ilinks, ilinks_p=output_p, ilinks_li=output_li)
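
A usage sketch comparing how many internal links appear overall versus inside paragraphs and list items (the page title is illustrative):

links = get_internal_links("Impressionism")
print(len(links['ilinks']), "total,",
      len(links['ilinks_p']), "in paragraphs,",
      len(links['ilinks_li']), "in list items")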
Example #11
# birthday cake (the setup lines defining rr_links are not shown in this excerpt)
create_zip_file(rr_links, 'birthday-cake')

# richmond, va
data_json = wiki.download_wiki_json("Richmond,_Virginia")
rr_links = wiki.links_as_list(data_json)
create_zip_file(rr_links, 'richmond-va')

# philosophy
links_us = wikitext.get_internal_links(
    'List_of_important_publications_in_philosophy')
create_zip_file(links_us['ilinks'], 'philosophy')

# impressionists
page_links = wikitext.get_internal_links("Impressionism")['ilinks'] + [
    "Impressionism"
]
create_zip_file(page_links, 'impressionists-text')

# novelists and poets
data = wiki.get_wiki_json("List_of_American_novelists")
data_html = data['text']['*']
authors = re.findall('<li><a href="/wiki/([^"]+)"', data_html)
nov_authors = authors[:(authors.index('Leane_Zugsmith') + 1)]

data = wiki.get_wiki_json("List_of_poets_from_the_United_States")
data_html = data['text']['*']
authors = re.findall('<li><a href="/wiki/([^"]+)"', data_html)
poe_authors = authors[:(authors.index('Louis_Zukofsky') + 1)]

create_zip_file(nov_authors + poe_authors, "novel-poem")
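
The script relies on a create_zip_file helper that is not shown in this excerpt. A minimal stand-in, written only as an assumption of what such a helper might do (download each page's parse JSON and store it in an archive), could look like this:

import json
import zipfile

def create_zip_file(links, name):
    # Hypothetical stand-in: cache each page's parse JSON inside <name>.zip.
    with zipfile.ZipFile(name + ".zip", "w") as zfile:
        for link in links:
            page = wiki.get_wiki_json(link)
            zfile.writestr(link + ".json", json.dumps(page))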