def imagenes(request):
    buscador = []
    if request.method == 'POST':
        tags_to_delete = request.POST.getlist('tags')
        tags_to_add = request.POST.getlist('Tagx')
        data_id = request.POST.getlist('id')
        buscador = request.POST.getlist('buscador')

        # Delete the selected tags from the image record
        if len(tags_to_delete) > 0:
            data = ScrapedImage.objects.filter(id=data_id[0])
            existing_tags = separar(data[0].tags)
            new_tags = delete_tags_from_img(tags_to_delete, existing_tags)
            new_tags_str = juntar(new_tags)
            ScrapedImage.objects.filter(id=data_id[0]).update(tags=new_tags_str)

        # Add the new tag to the image record
        if len(tags_to_add) > 0:
            data = ScrapedImage.objects.filter(id=data_id[0])
            existing_tags = separar(data[0].tags)
            existing_tags.append(tags_to_add[0])
            new_tags_str = juntar(existing_tags)
            ScrapedImage.objects.filter(id=data_id[0]).update(tags=new_tags_str)

    all_img = ScrapedImage.objects.all()
    tags = ScrapedImage.objects.values_list('tags')
    keywords = []
    if buscador:
        all_img = ScrapedImage.objects.filter(Q(tags__icontains=buscador[0]))
        tags = all_img.values_list('tags')
    for tag in tags:
        keywords.append(separar(tag[0]))
    all_info = zip(all_img, keywords)
    return render(request, 'imagenes.html', {'all_data': all_info})
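These views lean on the helpers separar and juntar to convert between the tag string stored on the model and a Python list. Their definitions are not part of this section; a minimal sketch, assuming the tags are persisted as one comma-separated string (the delimiter is an assumption):

def separar(texto):
    # Split the stored string into a list; "," as delimiter is assumed here
    if not texto:
        return []
    return [parte.strip() for parte in texto.split(',')]

def juntar(lista):
    # Inverse of separar: join the list back into the single stored string
    return ','.join(lista)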
def showdata(request):
    buscador = []
    if request.method == 'POST':
        if "Entrenar" in request.POST:
            # Mark the record as training data and feed it to the text classifier
            registro_a_entrenar = request.POST.getlist('Entrenar')
            info = ScrapedData.objects.get(id=registro_a_entrenar[0])
            info.data_entrenamiento = True
            info.save()
            update_classifier_text(info.information, info.classification)

        tags_to_delete = request.POST.getlist('tags')
        tags_to_add = request.POST.getlist('Tagx')
        link = request.POST.getlist('Linkx')
        classification_to_change = request.POST.getlist('type')
        data_id = request.POST.getlist('id')
        buscador = request.POST.getlist('buscador')

        # Change the record's classification
        if len(classification_to_change) > 0:
            ScrapedData.objects.filter(id=data_id[0]).update(classification=classification_to_change[0])

        # Delete the selected tags (and their paired metadata entries)
        if len(tags_to_delete) > 0:
            data = ScrapedData.objects.filter(id=data_id[0])
            existing_tags = separar(data[0].tags)
            existing_metadata = separar(data[0].metadata)
            new_tags, new_metadata = delete_tags_from_data(tags_to_delete, existing_tags, existing_metadata)
            new_tags_str = juntar(new_tags)
            new_metadata_str = juntar(new_metadata)
            ScrapedData.objects.filter(id=data_id[0]).update(tags=new_tags_str, metadata=new_metadata_str)

        # Add a tag, with an optional reference link
        if len(tags_to_add) > 0:
            data = ScrapedData.objects.filter(id=data_id[0])
            if len(link[0]) == 0:
                existing_tags_a, existing_metadata_a = add_tags_from_data(tags_to_add[0], "None", data[0].tags, data[0].metadata)
            else:
                existing_tags_a, existing_metadata_a = add_tags_from_data(tags_to_add[0], link[0], data[0].tags, data[0].metadata)
            ScrapedData.objects.filter(id=data_id[0]).update(tags=existing_tags_a, metadata=existing_metadata_a)

    all_data = ScrapedData.objects.all()
    tags_metadata = ScrapedData.objects.values_list('tags', 'metadata')
    references = []
    if buscador:
        all_data = ScrapedData.objects.filter(Q(tags__icontains=buscador[0]) | Q(classification__icontains=buscador[0]))
        tags_metadata = all_data.values_list('tags', 'metadata')
    for tag, metadata in tags_metadata:
        references.append((separar(tag), separar(metadata)))
    all_info = zip(all_data, references)
    return render(request, 'showdata.html', {'all_data': all_info})
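The paired update of tags and metadata above suggests that delete_tags_from_data keeps the two lists aligned, dropping each deleted tag together with the metadata entry at the same index. A plausible sketch under that parallel-list assumption (the real implementation is not shown in this section):

def delete_tags_from_data(tags_to_delete, existing_tags, existing_metadata):
    # Assumed layout: existing_metadata[i] is the reference link for existing_tags[i]
    new_tags, new_metadata = [], []
    for i, tag in enumerate(existing_tags):
        if tag not in tags_to_delete:
            new_tags.append(tag)
            if i < len(existing_metadata):
                new_metadata.append(existing_metadata[i])
    return new_tags, new_metadata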
import urllib.request
from os import remove

from google.cloud import translate, vision  # legacy (pre-0.25) client libraries

def get_info_photo(url):
    # Create the Vision and Translate service clients
    # (credentials are read from the environment)
    vision_client = vision.Client()
    translate_client = translate.Client()

    # Download the image to a temp file, detect its labels, then delete it
    urllib.request.urlretrieve(url, "temp.jpg")
    image = vision_client.image(filename='temp.jpg')
    labels = image.detect_labels()
    remove('temp.jpg')

    # Translate each label to Spanish, falling back to the English label on failure
    tags = []
    for label in labels:
        try:
            translated_label = translate_client.translate(
                label.description, target_language='es',
                format_='text', source_language='en')
            tags.append(translated_label['translatedText'])
        except Exception:
            tags.append(label.description)
    return juntar(limpieza(tags))
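vision.Client() and image.detect_labels() belong to the legacy (pre-0.25) google-cloud-vision package. On current releases the same label detection goes through ImageAnnotatorClient; roughly as below, a sketch of the migration rather than code the project ships:

from google.cloud import vision  # current (post-0.25) client library

def detect_labels(path):
    # Equivalent of the legacy image.detect_labels() call
    client = vision.ImageAnnotatorClient()
    with open(path, 'rb') as f:
        image = vision.Image(content=f.read())
    response = client.label_detection(image=image)
    return [label.description for label in response.label_annotations]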
def entities_example(request):
    # Bind the form only when data was submitted; on GET render it empty
    form = ClassifyForm(request.POST or None)
    args = {'form': form}
    if form.is_valid():
        sentence = form.cleaned_data['sentence']
        entities, metadata = get_entities(sentence)
        urls = separar(get_urls(metadata))
        entities_fixed = separar(juntar(limpieza(entities)))
        args = {'form': form, 'entities': entities_fixed, 'urls': urls}
    return render(request, 'entities_example.html', args)
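get_entities is not defined in this section. Given that its metadata is later passed to get_urls, it presumably wraps Cloud Natural Language entity analysis, whose entity metadata can include a 'wikipedia_url' key. A sketch under that assumption, using the current language_v1 client (not necessarily the project's implementation):

from google.cloud import language_v1

def get_entities(text):
    # Return entity names and their metadata dicts; get_urls would then
    # pull reference links (e.g. 'wikipedia_url') out of the metadata
    client = language_v1.LanguageServiceClient()
    document = language_v1.Document(content=text, type_=language_v1.Document.Type.PLAIN_TEXT)
    response = client.analyze_entities(document=document)
    names = [entity.name for entity in response.entities]
    metadata = [dict(entity.metadata) for entity in response.entities]
    return names, metadata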
def parse(self, response):
    # Collect every image and video (iframe) link outside the header and footer
    Imagenes = response.xpath('//img[not(ancestor::footer) and not(ancestor::header)]/@src').extract()
    Videos = response.xpath('//iframe[not(ancestor::footer) and not(ancestor::header)]/@src').extract()
    # URL of the current page
    url = response.url
    # In-order list of paragraphs, images, videos and text lists
    Lista_completa = response.xpath(
        '//p[not(ancestor::footer) and not(ancestor::*[contains(@class,"nav")])] '
        '| //img[not(ancestor::footer) and not(ancestor::header)]/@src '
        '| //iframe[not(ancestor::footer) and not(ancestor::header)]/@src '
        '| //ul[not(ancestor::footer) and not(@class)]').extract()
    # Page title
    titulo = response.xpath('string(//title)').extract()[0]

    Lista_videos_final = []
    Lista_imagenes_final = []
    Lista_informaciones_final = []

    k = 1     # position (order) of each item within the page
    leer = 1  # 0 means "skip this item": it was already merged into the previous paragraph
    for l, item in enumerate(Lista_completa):
        if leer == 1:
            if item in Imagenes:
                link = ajuste_img_src(item, response)
                width, height = getsizes(link)
                Lista_imagenes_final.append([link, "imagen", k, titulo, url, width, height])
                k = k + 1
            elif item in Videos:
                Lista_videos_final.append([item, "video", k, titulo, url])
                k = k + 1
            else:
                soup = BeautifulSoup(item, 'html.parser')
                texto = soup.get_text()
                if texto != "":
                    # A paragraph ending in ":" introduces the element that follows;
                    # merge both into one record and skip the next item
                    if (texto.endswith(":") and l + 1 < len(Lista_completa)
                            and Lista_completa[l + 1] not in Imagenes
                            and Lista_completa[l + 1] not in Videos):
                        soup2 = BeautifulSoup(Lista_completa[l + 1], 'html.parser')
                        if soup2.get_text() == "":
                            Lista_informaciones_final.append([texto, "informacion", k, titulo, url])
                        else:
                            Lista_informaciones_final.append([texto + "\n" + soup2.get_text(), "informacion", k, titulo, url])
                        leer = 0
                    else:
                        Lista_informaciones_final.append([texto, "informacion", k, titulo, url])
                    k = k + 1
        else:
            leer = 1

    # From here on the scraped paragraphs, images and videos can be used or stored.
    # Each entry of Lista_informaciones_final holds: [text, data type, order, title (topic), page URL]
    # Each entry of Lista_imagenes_final holds: [image URL, data type, order, title (topic), page URL, width, height]
    # Each entry of Lista_videos_final holds: [video URL, data type, order, title (topic), page URL]
    for img in Lista_imagenes_final:
        ScrapedImageItem(order=img[2], topic=img[3], url=img[4], information=img[0],
                         width=img[5], height=img[6], tags=get_info_photo(img[0])).save()
    for info in Lista_informaciones_final:
        classified = classify('../TextClassifier/classifier_bayes.pickle', info[0])
        entities, meta = get_entities(info[0])
        tag = juntar(limpieza(entities))
        urls = get_urls(meta)
        ScrapedDataItem(order=info[2], topic=info[3], url=info[4], information=info[0],
                        classification=classified, tags=tag, metadata=urls).save()
    for video in Lista_videos_final:
        ScrapedVideoItem(order=video[2], topic=video[3], url=video[4], information=video[0]).save()
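Two helpers the spider calls are defined elsewhere: ajuste_img_src, which must turn a possibly relative src into an absolute URL, and getsizes, which returns an image's dimensions. Plausible sketches; Scrapy's response.urljoin and Pillow are assumptions here, not necessarily what the project uses:

from io import BytesIO

import requests
from PIL import Image

def ajuste_img_src(src, response):
    # Resolve a relative src against the URL of the page being scraped
    return response.urljoin(src)

def getsizes(link):
    # Fetch the image and return its (width, height)
    img = Image.open(BytesIO(requests.get(link, timeout=10).content))
    return img.size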
def add_tags_from_data(tag_to_add, link, tags, metadata):
    lista = separar(tags)
    lista.append(tag_to_add)
    lista2 = separar(metadata)
    lista2.append('https://' + link)
    return juntar(lista), juntar(lista2)
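A quick round-trip, assuming the comma-delimited separar/juntar sketched earlier (the values are illustrative):

tags, meta = add_tags_from_data(
    'futbol', 'es.wikipedia.org/wiki/Futbol',
    'deporte,equipo', 'https://example.com/a,https://example.com/b')
# tags == 'deporte,equipo,futbol'
# meta == 'https://example.com/a,https://example.com/b,https://es.wikipedia.org/wiki/Futbol'

Note the unconditional 'https://' prefix: when showdata passes the "None" placeholder for a missing link, the stored metadata entry becomes 'https://None'.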