""" Four way of join of 'google android' on yahoo news, summize, youtube, and digg Combine results based on titles having an overlap of 3 terms or more Group results based on yahoo news title (remove duplicates) Redefined the group by equality operator to use text.norm to do near duplicate text removal In the group sum the digg and youtube favorite counts as the rank for each joined result Sort by rank, print to stdout """ __author__ = "BOSS Team" from util import console, text from yos.yql import db from yos.boss import ysearch ynews_data = ysearch.search_v1("google android", vertical="news", count=100, more={"news.ranking": "date"}) ynews = db.create(name="ynews", data=ynews_data) ynews.rename(before="headline", after="title") sm = db.create(name="sm", url="http://summize.com/search.json?q=google+android&rpp=60&lang=en") sm.rename(before="text", after="title") ytf = lambda r: {"title": r["title"]["value"], "favorites": int(r["statistics"]["favoriteCount"])} yt = db.select(name="yt", udf=ytf, url="http://gdata.youtube.com/feeds/api/videos?vq=google+android&lr=en&orderby=published") diggf = lambda r: {"title": r["title"]["value"], "diggs": int(r["diggCount"]["value"])} digg = db.select(name="dg", udf=diggf, url="http://digg.com/rss_search?search=google+android&area=dig&type=both§ion=news") def overlap_predicate(r1, r2): return text.overlap(r1["title"], r2["title"]) > 2
# See accompanying LICENSE file or http://www.opensource.org/licenses/BSD-3-Clause for the specific language governing permissions and limitations under the License.

"""
Search yahoo news and twitter for facebook.

Combine results with techmeme feeds based on titles having at least
2 term overlap.
Print results to stdout.
"""

__author__ = "BOSS Team"

from util import console, text
from yos.yql import db, udfs
from yos.boss import ysearch

# Yahoo news table; "headline" is renamed so every table exposes "title".
gn = db.create(
    name="gn",
    data=ysearch.search_v1("facebook", vertical="news", count=40),
)
gn.rename("headline", "title")

# Twitter search table; the tweet text is used as its title.
sm = db.create(
    name="sm",
    url="http://search.twitter.com/search.json?q=facebook&rpp=40",
)
sm.rename("text", "title")

# Techmeme firehose feed, passed through the framework's unnest_value udf.
tm = db.select(
    name="tm",
    udf=udfs.unnest_value,
    url="http://techmeme.com/firehose.xml",
)


def overlap(r1, r2):
    """Return True when the two rows' titles share at least 2 terms."""
    shared_terms = text.overlap(r1["title"], r2["title"])
    return shared_terms > 1


j = db.join(overlap, [gn, sm, tm])
j = db.sort(key="sm$id", table=j)

# One report entry per joined row: timestamp, then the three matching titles.
for row in j.rows:
    fields = (
        row["sm$created_at"],
        row["gn$title"],
        row["sm$title"],
        row["tm$title"],
    )
    console.write("\n%s\n[yahoo] %s\n[twitter] %s\n[techmeme] %s\n" % fields)
""" Search 'iphone' on yahoo news and sort by date Get the wikipedia edits for the iphone page Rank the news results based on their title/text overlap with the wikipedia entries Sort by the overlap sizes This could potentially be a new freshness model, based on the idea that wikipedia is updated for recent significance """ __author__ = "BOSS Team" from util import console, text from yos.boss import ysearch from yos.yql import db yn = db.create(name="yn", data=ysearch.search_v1("iphone sdk", vertical="news", count=50, more={"news.ranking": "date"})) wiki = db.create(name="wiki", url="http://en.wikipedia.org/w/index.php?title=IPhone_OS&feed=atom&action=history") tb = db.cross([yn, wiki]) def rankf(row): row.update( {"rank": text.overlap(row["yn$abstract"], row["wiki$summary"]["value"])} ) ; return row tb = db.select(udf=rankf, table=tb) tb = db.group(by=["yn$title"], key="rank", reducer=lambda d1,d2: d1+d2, as="total", table=tb, norm=text.norm) tb = db.sort(key="total", table=tb) print "Before\n" for r in yn.rows: console.write( "[news] %s\n" % r["yn$title"] )
""" Search 'google android' on yahoo news, summize, and digg Join results based on titles having an overlap of 3 terms or more Group duplicates based on yahoo news title In the group by sum by diggs, save as field rank Then sort by rank and print to stdout """ __author__ = "BOSS Team" from util import console, text from yos.yql import db from yos.boss import ysearch ynews_data = ysearch.search_v1("google android", vertical="news", count=60) ynews = db.create(name="ynews", data=ynews_data) ynews.rename(before="headline", after="title") sm = db.create(name="sm", url="http://summize.com/search.json?q=google+android&rpp=60&lang=en") sm.rename(before="text", after="title") titlef = lambda r: {"title": r["title"]["value"], "diggs": int(r["diggCount"]["value"])} digg = db.select(name="dg", udf=titlef, url="http://digg.com/rss_search?search=google+android&area=dig&type=both§ion=news") def overlap_predicate(r1, r2): return text.overlap(r1["title"], r2["title"]) > 2 tb = db.join(overlap_predicate, [ynews, sm, digg]) tb = db.group(by=["ynews$title"], key="dg$diggs", reducer=lambda d1, d2: d1 + d2, as="rank", table=tb, norm=text.norm) tb = db.sort(key="rank", table=tb)
""" Inner join popular delicious results and yahoo news results for the query 'iphone' Combine results which have at least 2 terms in common in their titles Then publish as a search results html page using the provided california template """ __author__ = "BOSS Team" from templates import publisher from util import text, console from yos.boss.ysearch import search_v1 from yos.yql import db, udfs dl = db.select(name="dl", udf=udfs.unnest_value, url="http://feeds.delicious.com/rss/popular/iphone") dl.describe() yn = db.create(name="yn", data=search_v1("iphone", vertical="news", count=50)) def overlap_predicate(r1, r2): return text.overlap(r1["title"], r2["title"]) > 1 serp = publisher.Serp(template_dir="templates/california", title="boss 'iphone'", endpoint="http://yahoo/search") tb = db.join(overlap_predicate, [dl, yn]) tb = db.group(by=["yn$title"], key=None, reducer=lambda x,y: None, as=None, table=tb, norm=text.norm) for row in tb.rows: serp.add(url=row["dl$link"], title=row["yn$title"], abstract=row["yn$abstract"], dispurl=row["yn$sourceurl"], source=row["dl$creator"]) serp.dump("iphone.html")