Example #1
import re
import sys

import GoogleNews                 # project-local helper modules
import fetch_url
import summarize
from readability import Document

number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])
regex = re.compile(r"<(.*?)>|&#13;")

article_list = []
summary_list = []

links = GoogleNews.search(query, number_of_links)

if not links:
    print "No links found"

else:
    result = fetch_url.fetch_parallel(links)

    while not result.empty():
        article = Document(result.get()).summary()
        article = re.sub(regex, "", article)
        article = article.encode('ascii', 'ignore')
        ss = summarize.SimpleSummarizer()
        summary = ss.summarize(article, 5)
        summary = summary.encode('ascii', 'ignore')
        article_list.append(article)
        summary_list.append(summary)
""" All the outputs are written to appropriate files in this part of the code """

for i in range(len(article_list)):
    f2 = open(query + str(i), 'w')
    f2.write(article_list[i] + '\n SUMMARY OF THE ABOVE ARTICLE: \n' +
             summary_list[i])
    f2.close()
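
Example #1 drains `result` with `.empty()` and `.get()`, so `fetch_url.fetch_parallel` evidently takes a list of URLs and returns a thread-safe queue of fetched page bodies. A minimal sketch of such a helper, assuming plain threads and `urllib2` (inferred from the call site, not taken from the project's actual fetch_url module):

import threading
import urllib2
from Queue import Queue

def fetch_parallel(urls):
    # Fetch every URL in its own thread and collect the page bodies in a
    # thread-safe queue; the caller drains it with .empty() / .get().
    result = Queue()

    def worker(url):
        try:
            result.put(urllib2.urlopen(url).read())
        except urllib2.URLError:
            pass  # skip links that fail to load

    threads = [threading.Thread(target=worker, args=(u,)) for u in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return result
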
Example #2
import re
import sys
import urllib2

import GoogleNews                 # project-local helpers, as in Example #1
import summarize
from readability import Document

number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])
regex = re.compile(r"<(.*?)>|&#13;")

article_list = []
summary_list = []

links = GoogleNews.search(query, number_of_links)  # perform a Google News search

if not links:
  print "No links found"                                                          #If no links for a query then..

else:
  for l in links:
    html = urllib2.urlopen(l).read()
    article = Document(html).summary()
    article = re.sub(regex, "", article)
    article = article.encode('ascii','ignore')
    ss = summarize.SimpleSummarizer()
    summary = ss.summarize(article,5)
    summary = summary.encode('ascii','ignore')
    article_list.append(article)
    summary_list.append(summary)


  """ All the outputs are written to appropriate files in this part of the code """
for i in range(1,number_of_links):
  f2 = open(query + str(i),'w')
  f2.write(article_list[i-1] + '\n' + summary_list[i-1])
  f2.close()
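
Both examples above depend on a project-local `GoogleNews.search(query, n)` that returns a list of article links. A minimal sketch of one way such a helper could work, assuming the public Google News RSS feed (the feed URL and the parsing here are assumptions, not the project's code):

import urllib
import urllib2
from xml.dom import minidom

def search(query, number_of_links):
    # Query the Google News RSS feed and return the first N article links.
    feed = "https://news.google.com/rss/search?q=" + urllib.quote_plus(query)
    dom = minidom.parseString(urllib2.urlopen(feed).read())
    links = [node.firstChild.data
             for node in dom.getElementsByTagName("link")
             if node.firstChild is not None]
    # The first <link> belongs to the channel itself, so skip it.
    return links[1:number_of_links + 1]
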
Example #3
import os
import shutil
import urllib

from readability import Document
from google import search          # pip package "google" (Python 2 era)


def busqueda(user_spec, lista):
    resultados_busqueda = []
    i = 1
    j = 1
    shutil.rmtree("ISW", ignore_errors=True)  # clear any previous output
    os.makedirs("ISW", 0755)
    for url in search(user_spec, tld='com', lang='es', stop=160):
        if i <= 10:
            print "--------------------"
            # Here we extract everything we need from the HTML
            html = urllib.urlopen(url).read()
            titulo = Document(html).short_title()
            articulo = Document(html).summary()

            print str(i) + ". " + titulo
            print url
            print " "

            os.makedirs("ISW/" + str(i), 0755)
            arch = open("ISW/" + str(i) + ".txt", "w")
            arch2 = open("ISW/" + str(i) + ".html", "w")
            arch.write(articulo.encode('utf-8'))
            arch2.write(articulo.encode('utf-8'))
            arch.close()
            arch2.close()

            # This is the little snippet for downloading images :v
            # (commented out; it would also need: from bs4 import BeautifulSoup as BS)
            """
            soup = BS(articulo)
            for imgtag in soup.find_all('img'):
                print(imgtag['src'])
                imagen_url = str ("http:" + imgtag['src'])
                nombre = imagen_url.rsplit('/',1)[1]
                print imagen_url
                urllib.urlretrieve(imagen_url, nombre)
            """

            # Here we count how many of each element the HTML contains
            imagenes = 0
            videos = 0
            texto = 0
            arch = open("isw/" + str(i) + ".txt", "r")
            for linea in arch:
                if '<img' in linea:
                    imagenes = imagenes + 1
                if '<vid' in linea:
                    videos = videos + 1
                if '<p' in linea:
                    texto = texto + 1
            arch.close()
            print "Imagenes = " + str(imagenes)
            print "Videos = " + str(videos)
            print "Texto = " + str(texto)

            if (int(lista[1]) <= imagenes <= int(lista[0]) and
                    int(lista[3]) <= videos <= int(lista[2]) and
                    int(lista[7]) <= texto <= int(lista[6])):
                print "KOWABUNGA"
                resultados_busqueda.append((str(j) + ". " + titulo, url))
                j += 1
            i = i + 1
        else:
            break

    return resultados_busqueda
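
A hypothetical driver for `busqueda`: the layout of `lista` (max/min pairs for images at indices 0-1, videos at 2-3, and paragraph tags at 6-7) is inferred from the comparisons above, and indices 4-5 are never read, so placeholders sit there:

# Keep results with at most 5 images, at most 2 videos, and 1-20 paragraphs.
limites = ["5", "0", "2", "0", "-", "-", "20", "1"]
for titulo, url in busqueda("ingenieria de software", limites):
    print titulo
    print url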