Example #1
def update():
  paper = "chicagotribune"
  feeds = ("http://feeds.chicagotribune.com/chicagotribune/news/nationworld/",)

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  beginurl = "http://www.chicagotribune.com"
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if 'articletools-print' in line:
        actualurls.append(beginurl + line.split('"')[3])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
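Note: every example on this page assumes the same MainUpdate helper: parse() fills links/titles from the RSS feeds, scrape() fills scrapefiles/scrapetitles with fetched article pages, download() fills outfiles/outtitles with the printable pages, and insert() stores the results in the DB. The class itself is not shown anywhere here, so the sketch below is only a hypothetical outline of that assumed interface, with placeholder bodies.

# Hypothetical outline of the MainUpdate interface the examples rely on.
# Attribute and method names are taken from the examples; the bodies are
# placeholders, not the original implementation.
class MainUpdate:
    def __init__(self):
        self.links, self.titles = [], []              # set by parse()
        self.scrapefiles, self.scrapetitles = [], []  # set by scrape()
        self.outfiles, self.outtitles = [], []        # set by download()

    def parse(self, paper, feeds, linktag="link"):
        """Collect new article links and titles from the given RSS feeds.

        Some examples pass "id" as the third argument, presumably to read
        the entry id instead of the link element.
        """

    def scrape(self, paper, links, titles):
        """Fetch each article page so callers can search it for a print URL."""

    def download(self, paper, urls, titles):
        """Fetch the printable pages into self.outfiles/self.outtitles."""

    def insert(self, paper, files, titles):
        """Store the downloaded articles and titles in the database."""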
Example #2
def update():
  paper = "politico"
  feeds = ("http://www.politico.com/rss/congress.xml",
           "http://www.politico.com/rss/politics.xml")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if 'shr-print' in line:
        actualurls.append(line.split('"')[3])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
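For illustration, the split('"')[3] trick above pulls the href value out of the matched line: splitting on double quotes puts attribute values at the odd indexes. A worked example on a hypothetical scraped line (the real Politico markup may differ):

# Hypothetical line from a scraped Politico page.
line = '<a class="shr-print" href="http://www.politico.com/news/0001.html">Print</a>'
print(line.split('"')[3])  # -> http://www.politico.com/news/0001.html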
Example #3
def update():
    paper = "washingtonpost"
    feeds = (
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/administration/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/congress/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/politics/elections/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/world/index_xml",
        "http://feeds.washingtonpost.com/wp-dyn/rss/business/economy/index_xml"
    )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds, "id")

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #printable
    actualurls = []
    for link in updatepaper.links:
        splitlink = link.split(".")
        if splitlink[3:]:
            actualurl = (splitlink[0] + "." + splitlink[1] + "." +
                         splitlink[2] + "_pf." + splitlink[3])
            actualurls.append(actualurl)

    #Download modified links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
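The split-on-dots rewrite above inserts "_pf" before the file extension, which was the Washington Post's printable-page convention at the time. A worked example on a hypothetical URL (the real feed links are not shown in the example):

# Hypothetical article URL.
link = "http://www.washingtonpost.com/wp-dyn/content/article/AR2009.html"
splitlink = link.split(".")
if splitlink[3:]:
    # ['http://www', 'washingtonpost', 'com/wp-dyn/content/article/AR2009', 'html']
    print(".".join(splitlink[:3]) + "_pf." + splitlink[3])
    # -> http://www.washingtonpost.com/wp-dyn/content/article/AR2009_pf.html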
Example #4
def update():
    paper = "jerusalempost"
    feeds = (
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463156",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333498",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1178443463144",
        "http://www.jpost.com/servlet/Satellite?pagename=JPost/Page/RSS&cid=1123495333468"
    )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Change links to printable
    actualurls = []
    for link in updatepaper.links:
        actualurl = link.replace("ShowFull", "Printer")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #5
def update():
    paper = "foxnews"
    feeds = ("http://feeds.foxnews.com/foxnews/world?format=xml",
             "http://feeds.foxnews.com/foxnews/politics?format=xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Modify links for printable version
    beginurl = "http://www.foxnews.com/printer_friendly_story/"
    actualurls = []
    for link in updatepaper.links:
        actualurl = beginurl + link.split('/')[-1].replace("2933", "3566")
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Make sure failures are taken out
    actualinsertfiles = []
    actualtitles = []
    for num, outfile in enumerate(updatepaper.outfiles):
        if not "404 Not Found" in outfile and not "Page cannot be found" in outfile:
            actualinsertfiles.append(outfile)
            actualtitles.append(updatepaper.outtitles[num])

    #Insert the modified links into the DB
    updatepaper.insert(paper, actualinsertfiles, actualtitles)
Example #6
def update():
    paper = "bostonglobe"
    feeds = ("http://syndication.boston.com/news/nation?mode=rss_10",
             "http://syndication.boston.com/news/politics/?mode=rss_10",
             "http://syndication.boston.com/business/?mode=rss_10")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Do bostonglobe-specific stuff to the links
    endurl = "?mode=PF"
    actualurls = []
    for link in updatepaper.links:
        actualurls.append(link.split('?')[0] + endurl)

    #Download the modified links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Strip some bad stuff out of the downloaded files
    insertfiles = []
    for file in updatepaper.outfiles:
        insertfiles.append(
            file.replace("document.location.replace(csplit);", ""))

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #7
def update():
    paper = "allafrica"
    feeds = ("http://allafrica.com/tools/headlines/rdf/latest/headlines.rdf", )

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Do allafrica-specific stuff to the links
    beginurl = "http://allafrica.com/stories/printable/"
    actualurls = []
    for link in updatepaper.links:
        splitlink = link.split("/")
        actualurl = beginurl + splitlink[-1]
        actualurls.append(actualurl)

    #Download the modified links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    #Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #8
def update():
  paper = "msnbc"
  feeds = ("http://rss.msnbc.msn.com/id/3032552/device/rss/rss.xml",
           "http://rss.msnbc.msn.com/id/3032506/device/rss/rss.xml")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Change links to printable
  dlext = "/print/1/displaymode/1098"
  actualurls = []
  for link in updatepaper.links:
    actualurl = link + dlext
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #9
def update():
  paper = "sydneymorningherald"
  feeds = ("http://feeds.smh.com.au/rssheadlines/top.xml",
           "http://feeds.smh.com.au/rssheadlines/national.xml",
           "http://feeds.smh.com.au/rssheadlines/world.xml")
  
  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #printable
  beginurl = "http://www.smh.com.au/cgi-bin/common/popupPrintArticle.pl?path=/articles/"
  actualurls = []
  for link in updatepaper.links:
    splitlink = link.split("/")
    actualurl = beginurl + "/".join(splitlink[-4:])
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #10
def update():
  paper = "straitstimes"
  feeds = ("http://www.straitstimes.com/STI/STIFILES/rss/break_world.xml",
           "http://www.straitstimes.com/STI/STIFILES/rss/break_money.xml",
           "http://www.straitstimes.com/STI/STIFILES/rss/break_sea.xml")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Do straitstimes-specific stuff to the links
  beginurl = "http://www.straitstimes.com/print"
  actualurls = []
  for link in updatepaper.links:
    splitlink = link.split("/")
    actualurl = beginurl + "/" + "/".join(splitlink[-4:])
    actualurls.append(actualurl)

  #Download the modified links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #11
def update():
  paper = "foreignpolicy"
  feeds = ("http://www.foreignpolicy.com/issue/featured_content.php",
           "http://www.foreignpolicy.com/node/feed",
           "http://www.foreignpolicy.com/taxonomy/term/655/0/feed")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if '>PRINT' in line:
        actualurls.append(line.split('"')[1])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Format the output files
  insertfiles = []
  for file in updatepaper.outfiles:
    readfile = file.split("\n")
    insertfile = ""

    for line in readfile:
      if not "window.print()" in line:
        insertfile = insertfile + "\n" + line
        
    insertfiles.append(insertfile)

  #Insert the modified links into the DB
  updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #12
def update():
  paper = "spiegel"
  feeds = ("http://www.spiegel.de/schlagzeilen/rss/0,5291,676,00.xml",)

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #printable
  actualurls = []
  for link in updatepaper.links:
    splitlink = link.split(",")
    actualurl = splitlink[0] + "," + splitlink[1] + ",druck-" + splitlink[2] + "," + splitlink[3]
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  if len(updatepaper.outfiles) == 0:
    return 0

  #Strip some bad stuff out of the downloaded files
  insertfiles = []
  for file in updatepaper.outfiles:
    insertfiles.append(file.replace('window.print()', ""))

  #Insert the modified links into the DB
  updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
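Spiegel's old URLs carried comma-separated numeric IDs, and the printable version prefixed the article ID with "druck-". A worked example on a hypothetical link of that shape:

# Hypothetical Spiegel URL in the old comma-separated form.
link = "http://www.spiegel.de/international/world/0,1518,650000,00.html"
splitlink = link.split(",")
print(splitlink[0] + "," + splitlink[1] + ",druck-" + splitlink[2] + "," + splitlink[3])
# -> http://www.spiegel.de/international/world/0,1518,druck-650000,00.html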
Example #13
def update():
  paper = "sfchronicle"
  feeds = ("http://feeds.sfgate.com/sfgate/rss/feeds/news",
           "http://feeds.sfgate.com/sfgate/rss/feeds/business")
 
  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds, "id")

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #printable
  actualurls = []
  actualtitles = []
  beginurl = "http://www.sfgate.com/cgi-bin/article.cgi?f="
  for link, title in zip(updatepaper.links, updatepaper.titles):
    if "DTL" in link:
      actualurl = beginurl + link.replace("http://feeds.sfgate.com", "") + "&type=printable"
      actualurls.append(actualurl)
      actualtitles.append(title)

  if not actualurls:
    print("No new articles found.")
    return 0

  #Download modified links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #14
def update():
  paper = "omahaworldherald"
  feeds = ("http://www.omaha.com/apps/pbcs.dll/section?category=rss&c=news03&mime=xml",
           "http://www.omaha.com/apps/pbcs.dll/section?category=rss&c=money01&mime=xml")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #printable
  actualurls = []
  beginurl = "http://www.omaha.com/apps/pbcs.dll/article?AID="
  for link in updatepaper.links:
    splitlink = link.split("/")
    actualurl = beginurl + "/" + "/".join(splitlink[-3:]) + "&template=printart"
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  if len(updatepaper.outfiles) == 0:
    return 0

  #Strip some bad stuff out of the downloaded files
  insertfiles = []
  for file in updatepaper.outfiles:
    insertfiles.append(file.replace('window.print()', ""))

  #Insert the modified links into the DB
  updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #15
def update():
  paper = "economist"
  feeds = ("http://www.economist.com/rss/briefings_rss.xml",
           "http://www.economist.com/rss/europe_rss.xml",
           "http://www.economist.com/rss/united_states_rss.xml",
           "http://www.economist.com/rss/the_americas_rss.xml",
           "http://www.economist.com/rss/middle_east_and_africa_rss.xml",
           "http://www.economist.com/rss/asia_rss.xml",
           "http://www.economist.com/rss/international_rss.xml",
           "http://www.economist.com/rss/finance_and_economics_rss.xml")
   
  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Modify links for printable version
  actualurls = []
  for link in updatepaper.links:
    actualurl = link.replace("displaystory", "PrinterFriendly")
    actualurls.append(actualurl)
  
  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #16
def update():
    paper = "latimes"
    feeds = (
        "http://feeds.latimes.com/latimes/news/nationworld/nation?format=xml",
        "http://feeds.latimes.com/latimes/news/nationworld/world?format=xml",
    )

    # Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    # Scrape articles for the printable link
    scrapeurls = []
    beginurl = "http://www.latimes.com/news"
    for url in updatepaper.links:
        urlparts = url.split("/")
        scrapeurl = beginurl + "/" + urlparts[6] + "/" + urlparts[7] + "/" + urlparts[-1]
        scrapeurls.append(scrapeurl)

    updatepaper.scrape(paper, scrapeurls, updatepaper.titles)

    if len(updatepaper.scrapefiles) == 0:
        print("No new articles found.")
        return 0

    # Get printable urls
    actualurls = []
    actualtitles = []
    beginurl = "http://www.latimes.com"
    for num, file in enumerate(updatepaper.scrapefiles):
        for line in file:
            if "articletools-print" in line:
                actualurls.append(beginurl + line.split('"')[3])
                actualtitles.append(updatepaper.scrapetitles[num])
                break

    # Download the scraped links
    updatepaper.download(paper, actualurls, actualtitles)

    # Insert the modified links into the DB
    updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #17
def update():
  paper = "bbc"
  feeds = ("http://newsrss.bbc.co.uk/rss/newsonline_world_edition/africa/rss.xml",
           "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/americas/rss.xml",
           "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/asia-pacific/rss.xml",
           "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/europe/rss.xml",
           "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/middle_east/rss.xml",
           "http://newsrss.bbc.co.uk/rss/newsonline_world_edition/south_asia/rss.xml"
           )


  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Do bbc-specific stuff to the links
  blacklist = ("sport", "default", "thread", "blogs", "picture_gallery", "pop_ups")
  beginurl = "http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk"
  actualurls = []
  actualtitles = []
  for link, title in zip(updatepaper.links, updatepaper.titles):
    #Skip links from blacklisted sections
    if any(word in link for word in blacklist):
      continue

    #The article path starts after the lone "-" segment of the old feed urls
    splitlink = link.split("/")
    beginnum = splitlink.index("-") + 1
    actualurls.append(beginurl + "/" + "/".join(splitlink[beginnum:]))
    actualtitles.append(title)

  #Check whether any urls are left after filtering
  if len(actualurls) == 0:
    print("No new articles found.")
    return 0

  #Download the modified links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
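The splitlink.index("-") step assumes the old BBC feed form, where links went through a /go/rss/-/ redirect and the real article path started after the lone "-" segment. A worked example on a hypothetical link of that shape:

# Hypothetical link in the old BBC "go/rss" redirect form.
link = "http://news.bbc.co.uk/go/rss/-/2/hi/africa/7000000.stm"
splitlink = link.split("/")
beginnum = splitlink.index("-") + 1
print("http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk/"
      + "/".join(splitlink[beginnum:]))
# -> http://newsvote.bbc.co.uk/mpapps/pagetools/print/news.bbc.co.uk/2/hi/africa/7000000.stm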
Example #18
def update():
  paper = "londontimes"
  feeds = ("http://feeds.timesonline.co.uk/c/32313/f/440158/index.rss",
           "http://feeds.timesonline.co.uk/c/32313/f/440154/index.rss")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Change links to printable
  beginurl = "http://timesonline.co.uk"
  dlext = "?print=yes"
  actualurls = []
  actualtitles = []
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if "print-comment" in line:
        actualurls.append(beginurl + line.split("'")[1] + dlext)
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #19
def update():
    paper = "dallasmorningnews"
    feeds = (
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewswash.xml",
        "http://www.dallasnews.com/newskiosk/rss/dallasnewsworld.xml")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #Download the links
    updatepaper.download(paper, updatepaper.links, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Format the output files
    insertfiles = []
    for file in updatepaper.outfiles:
        readfile = file.split("\n")
        insertfile = "<b>Dallas Morning News</b>"

        dowrite = 0
        for line in readfile:
            if dowrite == 1:
                if "<!-- vstory end -->" in line:
                    break
                else:
                    insertfile = insertfile + "\n" + line
            elif "<!-- vstory begin -->" in line:
                dowrite = 1

        insertfiles.append(insertfile)

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
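The dowrite scan above, which keeps only the lines between a begin marker and an end marker, recurs in several examples on this page (see feer below). A hedged general form of the same idea, with illustrative names:

# Generic version of the dowrite loop above; names are hypothetical.
def extract_between(text, begin_marker, end_marker, header=""):
    keep, writing = [header], False
    for line in text.split("\n"):
        if writing:
            if end_marker in line:
                break
            keep.append(line)
        elif begin_marker in line:
            writing = True
    return "\n".join(keep)

# e.g. extract_between(file, "<!-- vstory begin -->", "<!-- vstory end -->",
#                      "<b>Dallas Morning News</b>")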
Example #20
def update():
  paper = "foreignaffairs"
  feeds = ("http://www.foreignaffairs.com/rss.xml",)

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Scrape articles for the printable link
  updatepaper.scrape(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.scrapefiles) == 0:
    print("No new articles found.")
    return 0

  #Get printable urls
  actualurls = []
  actualtitles = []
  beginurl = "http://www.foreignaffairs.com"
  for num, file in enumerate(updatepaper.scrapefiles):
    for line in file:
      if 'print_html' in line:
        actualurls.append(beginurl + line.split('"')[3])
        actualtitles.append(updatepaper.scrapetitles[num])
        break

  #Download the scraped links
  updatepaper.download(paper, actualurls, actualtitles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #21
def update():
  paper = "financialtimes"
  feeds = ("http://www.ft.com/rss/world",
           "http://www.ft.com/rss/companies",
           "http://www.ft.com/rss/home/uk",
           "http://www.ft.com/rss/home/us",
           "http://www.ft.com/rss/home/europe",
           "http://www.ft.com/rss/home/asia",
           "http://www.ft.com/rss/lex")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0
    
  #Modify links for printable version
  actualurls = []
  for link in updatepaper.links:
    actualurl = link.replace("0/", "").replace(".html", ",print=yes.html")
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  if len(updatepaper.outfiles) == 0:
    return 0

  #Format the output files
  insertfiles = []
  for file in updatepaper.outfiles:
    readfile = file.split("\n")
    insertfile = "<b>Financial Times</b>"

    for line in readfile:
      if "ft-story-header" in line or "ft-story-body" in line or "charset" in line:
        insertfile = insertfile + "\n" + line
        
    insertfile = (insertfile.replace('lang=     "en">', '')
                  .replace('}// ]]></script>', '')
                  .replace('<script type="text/javascript">', ''))
    insertfiles.append(insertfile)

  #Insert the modified links into the DB
  updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #22
def update():
    paper = "seattlepi"
    feeds = ("http://seattlepi.nwsource.com/rss/apafrica.rss",
             "http://seattlepi.nwsource.com/rss/apaa.rss",
             "http://seattlepi.nwsource.com/rss/apasia.rss",
             "http://seattlepi.nwsource.com/rss/apelection.rss",
             "http://seattlepi.nwsource.com/rss/apmideast.rss",
             "http://seattlepi.nwsource.com/rss/apwash.rss")

    #Get links and titles from parsing
    updatepaper = MainUpdate()
    updatepaper.parse(paper, feeds)

    if len(updatepaper.links) == 0:
        print("No new articles found.")
        return 0

    #printable version
    beginurl = "http://seattlepi.nwsource.com/printer2/index.asp?ploc=t&refer="
    actualurls = []
    for link in updatepaper.links:
        actualurl = beginurl + link
        actualurls.append(actualurl)

    #Download the links
    updatepaper.download(paper, actualurls, updatepaper.titles)

    if len(updatepaper.outfiles) == 0:
        return 0

    #Strip some bad stuff out of the downloaded files
    insertfiles = []
    for file in updatepaper.outfiles:
        insertfiles.append(file.replace('window.print()', ""))

    #Insert the modified links into the DB
    updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #23
def update():
  paper = "csm"
  feeds = ("http://www.csmonitor.com/rss/top.rss",
           "http://rss.csmonitor.com/feeds/usa",
           "http://rss.csmonitor.com/feeds/world")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Change links to printable
  actualurls = []
  dlext = ".htm?print=true"
  for link in updatepaper.links:
    splitlink = link.split(".html")
    actualurls.append(splitlink[0] + dlext)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  if len(updatepaper.outfiles) == 0:
    return 0

  #Format the output files
  insertfiles = []
  for file in updatepaper.outfiles:
    readfile = file.split("\n")
    insertfile = ""

    for line in readfile:
      if not "window.print()" in line:
        insertfile = insertfile + "\n" + line
        
    insertfiles.append(insertfile)

  #Insert the modified links into the DB
  updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #24
def update():
  paper = "feer"
  feeds = ("http://www.feer.com/rss?cat=politics",
           "http://www.feer.com/rss?cat=international-relations",
           "http://www.feer.com/rss?cat=economics")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Download the links
  updatepaper.download(paper, updatepaper.links, updatepaper.titles)

  if len(updatepaper.outfiles) == 0:
    return 0

  #Format the output files
  insertfiles = []
  for file in updatepaper.outfiles:
    readfile = file.split("\n")
    insertfile = "<b>Far Eastern Economic Review</b>"

    dowrite = 0
    for line in readfile:
      if dowrite == 1:
        if '<div class="content_box">' in line:
          break
        else:
          insertfile = insertfile + "\n" + line
      elif "<!-- Some content EG article -->"  in line:
        dowrite = 1
        
    insertfiles.append(insertfile)

  #Insert the modified links into the DB
  updatepaper.insert(paper, insertfiles, updatepaper.outtitles)
Example #25
def update():
  paper = "miamiherald"
  feeds = ("http://www.miamiherald.com/884/index.xml",
           "http://www.miamiherald.com/509/index.xml",
           "http://www.miamiherald.com/578/index.xml",
           "http://www.miamiherald.com/579/index.xml",
           "http://www.miamiherald.com/581/index.xml",
           "http://www.miamiherald.com/582/index.xml")
        
  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Change links to printable
  actualurls = []
  for link in updatepaper.links:
    actualurl = link.replace("/story/", "/v-print/story/")
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Weed out the incomplete articles
  insertfiles = []
  inserttitles = []
  for num, file in enumerate(updatepaper.outfiles):
    if "Click here for full story" not in file:
      insertfiles.append(file)
      inserttitles.append(updatepaper.outtitles[num])

  #Insert the modified links into the DB
  updatepaper.insert(paper, insertfiles, inserttitles)
Example #26
def update():
  paper = "londontimes"
  feeds = ("http://www.timesonline.co.uk/tol/feeds/rss/worldnews.xml",)

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Change links to printable
  dlext="?print=yes"
  actualurls = []
  for link in updatepaper.links:
    actualurl = link.split("#")[0] + dlext
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #27
def update():
  paper = "japantimes"
  feeds = ("http://feeds.feedburner.com/japantimes_news",)

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Change links to printable
  beginurl = "http://search.japantimes.co.jp/print/"
  actualurls = []
  for link in updatepaper.links:
    actualurl = beginurl + link.split("/")[-1]
    actualurls.append(actualurl)

  #Download the links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)
Example #28
def update():
  paper = "ajc"
  feeds = ("http://www.ajc.com/section-rss.do?source=nation-world",
           "http://www.ajc.com/genericList-rss.do?source=94547")

  #Get links and titles from parsing
  updatepaper = MainUpdate()
  updatepaper.parse(paper, feeds)

  if len(updatepaper.links) == 0:
    print("No new articles found.")
    return 0

  #Do ajc-specific stuff to the links
  actualurls = []
  for link in updatepaper.links:
    actualurl = link.split("?")[0] + "?printArticle=y"
    actualurls.append(actualurl)

  #Download the modified links
  updatepaper.download(paper, actualurls, updatepaper.titles)

  #Insert the modified links into the DB
  updatepaper.insert(paper, updatepaper.outfiles, updatepaper.outtitles)