def process_entry(entry):
    """Return a dictionary of UpLib metadata drawn from the elements of this entry.

    :param entry: the FeedParser entry
    :type entry: a dictionary of metadata about the entry
    :return: a dictionary of UpLib metadata about the entry, or None if the
        entry has no usable link.  The "original-url" field is guaranteed.
    :rtype: dict
    """
    d = {}
    from uplib.webutils import parse_URL
    if "link" not in entry:
        return None
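    # "origlink", when present, presumably carries the original article URL
    # passed through by an aggregator (FeedBurner does this), so prefer it
    # to the possibly rewritten "link"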
    link = entry.get("origlink") or entry.get("link")
    # some elementary ad filtering
    host, port, path = parse_URL(link)
    if host.startswith("ads."):
        return None
    d["original-url"] = link
    if entry.has_key("title"):
        d["title"] = HTMLENTITIES.sub(deescape_html, entry.get("title"))
    if entry.has_key("summary"):
        summary = HTMLENTITIES.sub(deescape_html, entry.get("summary"))
        summary = re.sub(r"\s", " ", summary)
        if '<' in summary:
            summary = summary[:summary.index('<')]
        d["abstract"] = summary
        d["summary"] = summary
    author = None
    if entry.has_key("author_detail") and entry.get("author_detail").has_key("name"):
        author = entry.get("author_detail").get("name")
    elif entry.has_key("author"):
        author = entry.author
    if author:
        # ny times does bylines strangely
        if host.endswith("nytimes.com"):
            if author.startswith("By "):
                author = author[3:]
            # capitalize properly
            author = author.title()
            # lowercase "And"
            author = author.replace(" And ", " and ")
        d["authors"] = author
    if entry.has_key("updated_parsed"):
        date = entry.updated_parsed
    elif entry.has_key("published_parsed"):
        date = entry.published_parsed
    elif entry.has_key("created_parsed"):
        date = entry.created_parsed
    else:
        date = None
    if date:
        # struct_time -> "month/day/year"
        d["date"] = "%s/%s/%s" % (date[1], date[2], date[0])
        d["rss-timestamp"] = str(int(time.mktime(date)))
    d["rss-id"] = entry.get("id") or entry.get("guid") or entry.get("link")
    return d
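
# A minimal usage sketch (illustration only, not part of the original module):
# process_entry() is meant to be fed entries from the universal feed parser.
# The feed URL and the handle_document() consumer here are hypothetical.
#
#   import feedparser
#   feed = feedparser.parse("http://example.com/index.rss")
#   for entry in feed.entries:
#       md = process_entry(entry)
#       if md is not None:
#           handle_document(md["original-url"], md)


class NewYorkTimesRipper:
    # hypothetical class name: the indented rip() method below clearly
    # belongs to a ripper class, whose own statement is assumed here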
    def rip(self, folder, docid):
        """Augment metadata.txt for a New York Times article with fields
        scraped from the cached original page."""

        def encodestring(s):
            # nytimes strings contain XML character references; decode to Unicode
            if not s:
                return s
            # numeric character references, e.g. "&#8217;"
            s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
            # named entities, e.g. "&amp;" (name2codepoint is a dict mapping
            # entity names to code points, so index it and convert the result)
            s = re.sub(r"&([a-z]+);",
                       lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]),
                       s)
            return s
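
        # e.g. encodestring("Ben &amp; Jerry&#39;s") -> u"Ben & Jerry's"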

        mdpath = os.path.join(folder, "metadata.txt")
        originalspath = os.path.join(folder, "originals", "original.html")
        if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
            return
        md = read_metadata(mdpath)
        url = md.get("original-url")
        if not url:
            return
        host, port, path = parse_URL(url)
        if host != "www.nytimes.com":
            return

        # OK, it's from the NY Times
        new_metadata = MetadataGatherer.parse(originalspath)
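        # MetadataGatherer.parse (defined elsewhere in the module) collects the
        # metadata embedded in the cached page; Times pages supply, among
        # others, "hdl" (headline), "byl" (byline), "pdate" (publication date
        # as YYYYMMDD), "keywords", and "description".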

        if "source" not in md:
            md["source"] = "New York Times"

        # not all articles have metadata...
        if not (("title" in new_metadata or "hdl" in new_metadata) and "pdate" in new_metadata):
            note(3, "No metadata in article:  %s", new_metadata)
            return

        md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
        if "date" not in md:
            # get the date
            d = new_metadata.get("pdate")
            md["date"] = "%s/%s/%s" % (d[4:6], d[6:], d[:4])
        if "authors" not in md:
            # get the byline
            d = new_metadata.get("byl")
            if d:
                if d.startswith("By "):
                    d = d[3:]
                # capitalize properly
                d = d.title()
                # lowercase "And"
                d = d.replace(" And ", " and ")
                md["authors"] = encodestring(d)
        d = new_metadata.get("keywords")
        d0 = md.get("keywords")
        if d0:
            d0 += ("," + d)
        else:
            d0 = d
        if d0:
            md["keywords"] = encodestring(d0)
        if new_metadata.get("description"):
            md["summary"] = encodestring(new_metadata.get("description"))
        update_metadata(mdpath, md)
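

class WashingtonPostRipper:
    # hypothetical class name, assumed for the same reason as above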
    def rip(self, folder, docid):
        """Augment metadata.txt for a Washington Post article with fields
        scraped from the cached original page."""
        def encodestring(s):
            # WashPost strings have xml char refs, and we want Unicode
            if not s:
                return s

            s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
            s = re.sub(r"&([a-z]+);", lambda x: htmlentitydefs.name2codepoint(x.group(1)), s)
            return s

        def dequote(s):
            # undo JavaScript-escaped quotes: "O\'Brien" -> "O'Brien"
            return re.sub(r"\\'", "'", s)

        def catclean(s):
            # category names may not contain "/" or ",", so replace them
            return re.sub(r"[/,]", "_", s)

        mdpath = os.path.join(folder, "metadata.txt")
        originalspath = os.path.join(folder, "originals", "original.html")
        if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
            return
        md = read_metadata(mdpath)
        url = md.get("original-url")
        if not url:
            return
        host, port, path = parse_URL(url)
        if host != "www.washingtonpost.com":
            return

        # OK, it's from the Post
        new_metadata = MetadataGatherer.parse(originalspath)
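        # The page's embedded metadata doesn't carry everything, so the loop
        # below also scrapes fields (headline, authors, content id, section)
        # straight out of the page text; _HEADLINE, _TITLEPATTERN,
        # _AUTHORSPATTERN, _CONTENTID, and _SECTION are module-level
        # constants defined elsewhere.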
        for line in open(originalspath):
            if line.startswith(_HEADLINE):
                line = line[len(_HEADLINE):].strip("\n")
                t = _TITLEPATTERN.match(line)
                if t:
                    new_metadata["hdl"] = dequote(t.group("title"))
            m = _AUTHORSPATTERN.search(line)
            if m:
                new_metadata["authors"] = dequote(line[m.end():].strip(" ';\n"))
            if line.startswith(_CONTENTID):
                new_metadata["content-id"] = line[len(_CONTENTID):].strip(" ';\n")
            if line.startswith(_SECTION):
                section = line[len(_SECTION):].strip(" ';\n")
                # take only the part up to the closing quote, if there is one
                i = section.find("'")
                if i >= 0:
                    section = section[:i]
                new_metadata["section"] = section

        if "source" not in md:
            md["source"] = "Washington Post"

        # not all articles have metadata...
        if not ("hdl" in new_metadata):
            note(3, "No metadata in article:  %s", new_metadata)
            return

        md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))

        if "date" not in md:
            # get the date
            d = _URLDATEPATTERN.match(url)
            if d:
                md["date"] = "%s/%s/%s" % (d.group("month"), d.group("day"), d.group("year"))

        if "authors" not in md:
            # get the byline
            d = new_metadata.get("authors")
            if d:
                md["authors"] = encodestring(d)

        d = new_metadata.get("keywords")
        d0 = md.get("keywords")
        if d and d0:
            d0 = [x.strip() for x in d0.split(",")] + [x.strip() for x in d.split(";")]
        elif d:
            d0 = [x.strip() for x in d.split(";")]
        if d0:
            md["keywords"] = encodestring(",".join(d0))
        if new_metadata.get("description"):
            md["summary"] = encodestring(new_metadata.get("description"))
            md["abstract"] = encodestring(new_metadata.get("description"))
        section = new_metadata.get("section")
        if section:
            c = md.get("categories")
            if c:
                c = [x.strip() for x in c.split(",")]
            else:
                c = []
            c = c + ["article", "Washington Post/%s" % catclean(section)]
            md["categories"] = ",".join(c)
        content_id = new_metadata.get("content-id")
        if content_id:
            md["citation"] = "Washington Post article %s" % content_id
        update_metadata(mdpath, md)
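
# Both rip() methods follow the same shape: given a document folder holding
# "metadata.txt" and "originals/original.html", re-read the stored metadata,
# enrich it from the cached page, and write it back.  A hedged driving
# sketch, using the hypothetical class name from above:
#
#   ripper = WashingtonPostRipper()
#   ripper.rip("/path/to/repo/docs/SOME-DOC-ID", "SOME-DOC-ID")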