# Parallel version: download every result page concurrently, then extract and summarize it.
# Usage: python <script>.py <number_of_links> <query terms ...>
import re
import sys

import GoogleNews      # project-local Google News search helper
import fetch_url       # project-local parallel URL fetcher
import summarize       # summarizer module providing SimpleSummarizer
from readability.readability import Document

number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])
regex = re.compile("<(.*?)>|\ ")
article_list = []
summary_list = []

links = GoogleNews.search(query, number_of_links)   # Perform Google News search
if not links:
    print "No links found"
else:
    # fetch_parallel() downloads all links concurrently and returns a queue of raw HTML
    result = fetch_url.fetch_parallel(links)
    while not result.empty():
        article = Document(result.get()).summary()   # readability extracts the main article text
        article = re.sub(regex, "", article)
        article = article.encode('ascii', 'ignore')
        ss = summarize.SimpleSummarizer()
        summary = ss.summarize(article, 5)           # keep the 5 most representative sentences
        summary = summary.encode('ascii', 'ignore')
        article_list.append(article)
        summary_list.append(summary)

    # All the outputs are written to appropriate files in this part of the code
    for i in range(len(article_list)):
        f2 = open(query + str(i), 'w')
        f2.write(article_list[i] + '\n SUMMARY OF THE ABOVE ARTICLE: \n' + summary_list[i])
        f2.close()
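# The parallel variant above depends on a project-local fetch_url module whose code is not
# shown.  The helper below is only an assumed sketch of what fetch_parallel() could look
# like, given how it is used: it takes a list of URLs and returns a Queue of raw HTML
# strings, silently skipping pages that fail to download.
import threading
import urllib2
from Queue import Queue

def fetch_parallel(urls):
    result = Queue()

    def fetch(url):
        try:
            result.put(urllib2.urlopen(url, timeout=10).read())
        except Exception:
            pass   # a page that cannot be downloaded is simply skipped

    threads = [threading.Thread(target=fetch, args=(u,)) for u in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return result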
# Sequential version: download each link one at a time with urllib2.
# Usage: python <script>.py <number_of_links> <query terms ...>
import re
import sys
import urllib2

import GoogleNews      # project-local Google News search helper
import summarize       # summarizer module providing SimpleSummarizer
from readability.readability import Document

number_of_links = int(sys.argv[1])
query = '+'.join(sys.argv[2:])
regex = re.compile("<(.*?)>|\ ")
article_list = []
summary_list = []

links = GoogleNews.search(query, number_of_links)   # Perform Google News search
if not links:
    print "No links found"                          # If no links for a query, stop here
else:
    for l in links:
        html = urllib2.urlopen(l).read()             # download one article at a time
        article = Document(html).summary()           # readability extracts the main article text
        article = re.sub(regex, "", article)
        article = article.encode('ascii', 'ignore')
        ss = summarize.SimpleSummarizer()
        summary = ss.summarize(article, 5)           # keep the 5 most representative sentences
        summary = summary.encode('ascii', 'ignore')
        article_list.append(article)
        summary_list.append(summary)

    # All the outputs are written to appropriate files in this part of the code
    for i in range(len(article_list)):
        f2 = open(query + str(i + 1), 'w')
        f2.write(article_list[i] + '\n' + summary_list[i])
        f2.close()
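# Both versions call GoogleNews.search(), another project-local module whose code is not
# shown.  As an illustration only, a helper with the same interface (query string and
# number of links in, list of article URLs out) could be built on the Google News RSS
# feed; the feed URL and the use of feedparser here are assumptions, not the original
# module.
import feedparser   # pip install feedparser

def google_news_search(query, number_of_links):
    # `query` is already '+'-joined, so it can go straight into the query string
    feed = feedparser.parse("https://news.google.com/rss/search?q=" + query)
    return [entry.link for entry in feed.entries[:number_of_links]]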
import os
import shutil
import urllib

from google import search                      # pip package "google" (newer releases expose this as "googlesearch")
from readability.readability import Document
# from bs4 import BeautifulSoup as BS          # only needed by the commented-out image download block


def busqueda(user_spec, lista):
    resultados_busqueda = []
    i = 1
    j = 1
    # Start with a clean ISW/ output directory
    if os.path.exists("ISW"):
        shutil.rmtree("ISW")
    os.makedirs("ISW", 0755)
    for url in search(user_spec, tld='com', lang='es', stop=160):
        if i <= 10:
            print "--------------------"
            # Here we extract everything we need from the HTML
            html = urllib.urlopen(url).read()
            titulo = Document(html).short_title()
            articulo = Document(html).summary()
            print str(i) + ". " + titulo
            print url
            print " "
            os.makedirs("ISW/" + str(i), 0755)
            arch = open("ISW/" + str(i) + ".txt", "w")
            arch2 = open("ISW/" + str(i) + ".html", "w")
            arch.write(articulo.encode('utf-8'))
            arch2.write(articulo.encode('utf-8'))
            arch.close()
            arch2.close()
            # This is the little snippet that downloads images :v
            """
            soup = BS(articulo)
            for imgtag in soup.find_all('img'):
                print(imgtag['src'])
                imagen_url = str("http:" + imgtag['src'])
                nombre = imagen_url.rsplit('/', 1)[1]
                print imagen_url
                urllib.urlretrieve(imagen_url, nombre)
            """
            # Here we count how many of each element the HTML has
            imagenes = 0
            videos = 0
            texto = 0
            arch = open("ISW/" + str(i) + ".txt", "r")
            for linea in arch:
                if '<img' in linea:
                    imagenes = imagenes + 1
                if '<vid' in linea:
                    videos = videos + 1
                if '<p' in linea:
                    texto = texto + 1
            arch.close()
            print "Imagenes = " + str(imagenes)
            print "Videos = " + str(videos)
            print "Texto = " + str(texto)
            # Keep the result only if it falls inside the max/min bounds given in `lista`
            if (imagenes <= int(lista[0])) and (imagenes >= int(lista[1])) and \
               (videos <= int(lista[2])) and (videos >= int(lista[3])) and \
               (texto <= int(lista[6])) and (texto >= int(lista[7])):
                print "KOWABUNGA"
                resultados_busqueda.append((str(j) + ". " + titulo, url))
                j += 1
            i = i + 1
        else:
            break
    return resultados_busqueda
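# Hypothetical usage sketch for busqueda().  Judging from the bounds checked above, the
# `lista` argument holds max/min pairs: indices 0-1 for images, 2-3 for videos and 6-7 for
# <p> blocks; what indices 4 and 5 hold is not visible in this function, so they are left
# as placeholders here.  The query string is also just an example.
if __name__ == '__main__':
    criterios = ['10', '0',    # images: max, min
                 '5', '0',     # videos: max, min
                 '0', '0',     # indices 4-5: not used by busqueda()
                 '80', '1']    # paragraphs (<p> tags): max, min
    for titulo, url in busqueda("ingenieria de software", criterios):
        print titulo
        print url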