Exemplo n.º 1
0
def cluster(url):
    """
    Read URLs from sitemaps and return clusters.

    url is either a website (and we detect sitemaps) or a sitemap.
    A missing scheme is assumed to be "http://". A URL with something
    after the host (path, query or fragment) is used as the sitemap URL;
    otherwise the site's robots.txt is consulted, falling back to
    <site>/sitemap.xml when it lists no sitemaps.

    Returns a JSON string. On success the object holds "count" (number of
    URLs read from the sitemaps) and "html" (a <pre> report listing every
    cluster, largest first, with up to five sample URLs each, followed by
    the unclustered URLs). On failure it holds "error" with a short
    message.
    """
    # Function-scope import: sitemap content is third-party input, so every
    # URL and pattern is escaped before it is placed in the HTML report.
    from html import escape

    sample_size = 5  # URLs shown per cluster in the report

    data = {}
    if url[:4] != "http":
        url = "http://" + url

    if re.search(r"https?://[^/?#]+[/?#].+", url):
        sitemaps = [url]  # sitemap URL given
    else:
        site = url.strip("/")
        sitemaps = sitemaps_from_robots(site + "/robots.txt")
        if not sitemaps:
            # assume sitemap.xml
            sitemaps = [site + "/sitemap.xml"]

    # `sitemaps` is never empty at this point (either the given URL, the
    # robots.txt entries or the sitemap.xml fallback), so there is no
    # separate "no sitemaps" outcome to report.
    try:
        urls = read_sitemaps(sitemaps)
        if not urls:
            data["error"] = "No URLs found in sitemap"
        else:
            data["count"] = len(urls)
            # cluster URLs
            c = urlclustering.cluster([x.strip() for x in urls])
            untouched = deepcopy(c["clusters"])
            try:
                improve_patterns(c["clusters"])
            except Exception:
                # Pattern improvement is best-effort: if it fails part-way,
                # report the clusters exactly as the clustering produced them.
                logging.debug(traceback.format_exc())
                c["clusters"] = untouched

            # prepare HTML
            clusters = c["clusters"]
            by_size = sorted(
                clusters, key=lambda k: len(clusters[k]), reverse=True)
            parts = ["<pre>CLUSTERS:"]
            for key in by_size:
                members = clusters[key]
                parts.append(
                    "\n%s [%s URLs]<br/>" % (escape(key[1]), len(members)))
                parts.append(
                    "\t" + "\n\t".join(
                        escape(u) for u in members[:sample_size]))
                # Only mention the remainder when some URLs were left out;
                # small clusters would otherwise show "...0 more" or a
                # negative count.
                hidden = len(members) - sample_size
                if hidden > 0:
                    parts.append("\n\t...%s more" % hidden)
            parts.append("\n\nUNCLUSTERED:\n")
            parts.append(
                "\t" + "\n\t".join(escape(u) for u in c["unclustered"]))
            parts.append("</pre>")
            data["html"] = "".join(parts)
    except Exception:
        # Boundary handler: any failure while fetching or clustering is
        # logged and reported to the caller as a generic error.
        logging.debug(traceback.format_exc())
        data["error"] = "An error happened while fetching sitemaps"

    return json.dumps(data)
Exemplo n.º 2
0
def cluster_urls(
        urls: Iterable[str],
        min_cluster_size: int = 10) -> pd.DataFrame:
    """
    Group URLs with the regex rules of the ``urlclustering`` package:
    https://pypi.org/project/urlclustering/

    urls: Iterable[str]
        URLs to cluster.
    min_cluster_size: int
        Minimum cluster size, passed through to ``urlclustering.cluster``.

    Returns a DataFrame indexed by URL with two columns: ``cluster`` (the
    second element of the cluster key for clustered URLs, the URL itself
    for unclustered ones) and ``unclustered`` (0 for clustered URLs, 1
    for unclustered ones).
    """
    import urlclustering
    result = urlclustering.cluster(urls, min_cluster_size)

    rows = {}
    # Clustered URLs first, labelled with their cluster key's second item.
    for pattern, members in result['clusters'].items():
        for member in members:
            rows[member] = [pattern[1], 0]
    # Unclustered URLs are labelled with themselves; written last so they
    # override any clustered entry for the same URL.
    for leftover in result['unclustered']:
        rows[leftover] = [leftover, 1]

    return pd.DataFrame.from_dict(
        rows, orient='index', columns=['cluster', 'unclustered'])
Exemplo n.º 3
0
#!/usr/bin/env python
from __future__ import print_function
import random
import urlclustering


def pprint(clusters):
    """Print each cluster: its key's two items, then its URLs.

    clusters maps a key to the URLs of that cluster. The key's first item
    is printed under the REGEX label and its second under the HUMAN label;
    the URLs follow, one per tab-indented line, with a blank line after
    each cluster.
    """
    for pattern, members in clusters.items():
        print('REGEX:', pattern[0])
        print('HUMAN:', pattern[1])
        print('URLS:')
        listing = '\n\t'.join(members)
        # The trailing '\n' plus print's own newline yields the blank line.
        print('\t', listing, '\n', sep='')

# Sample input: two stand-alone pages ...
urls = [u'http://example.com', u'http://example.com/about']

# ... five category pages ...
cats = [
    u'http://example.com/cat/%s' % x
    for x in ('sports', 'tech', 'life', 'politics', 'world')
]

# ... ten tag pages, each under a random three-digit number ...
tags = [
    u'http://example.com/tag/%s/tag%s' % (random.randint(100, 999), x)
    for x in range(10)
]

# ... and ten article pages addressed by a query-string id.
arts = [
    u'http://example.com/article/?id=%s' % x
    for x in range(10)
]

# NOTE(review): the second argument is presumably the minimum cluster
# size -- confirm against the urlclustering documentation.
c = urlclustering.cluster(urls + cats + tags + arts, 5)

pprint(c['clusters'])
print('UNCLUSTERED:')
print('\t', '\n\t'.join(c['unclustered']), sep='')
Exemplo n.º 4
0
import random

import urlclustering


def pprint(clusters):
    """Print every cluster with its key's two items and its URLs.

    clusters maps a key to the URLs of that cluster. Item 0 of the key is
    printed after 'REGEX:' and item 1 after 'HUMAN:'; the URLs are then
    listed one per tab-indented line, followed by a blank line.
    """
    labelled_positions = (('REGEX:', 0), ('HUMAN:', 1))
    for key, members in clusters.items():
        for label, position in labelled_positions:
            print(label, key[position])
        print('URLS:')
        # end='\n\n' closes the last URL line and adds the blank separator.
        print('\t' + '\n\t'.join(members), end='\n\n')


# Sample input for the clustering demo. The tag URLs below call
# random.randint, so this script needs the standard-library `random`
# module imported alongside urlclustering.

# Two stand-alone pages.
urls = [
    u'http://example.com',
    u'http://example.com/about',
]
# Five category pages.
cats = [
    u'http://example.com/cat/%s' % x
    for x in ('sports', 'tech', 'life', 'politics', 'world')
]
# Ten tag pages, each under a random three-digit number.
tags = [
    u'http://example.com/tag/%s/tag%s' % (random.randint(100, 999), x)
    for x in range(10)
]
# Ten article pages addressed by a query-string id.
arts = [u'http://example.com/article/?id=%s' % x for x in range(10)]

# NOTE(review): the second argument is presumably the minimum cluster
# size -- confirm against the urlclustering documentation.
c = urlclustering.cluster(urls + cats + tags + arts, 5)

pprint(c['clusters'])
print('UNCLUSTERED:')
print('\t' + '\n\t'.join(c['unclustered']))