def save(self, *args, **kwargs):
    """Store this item as a new ``Document`` or merge it into its duplicate.

    If ``self.duplication`` is truthy the work is delegated to
    ``self.merge()`` and nothing else happens here. Otherwise a new
    ``Document`` is created with ``self.description``, and for every title,
    date, location, genre, artist and tag token a ``Token`` is saved and
    linked to the document through the matching ``*Order`` model. Every
    entry of ``self.urls`` and ``self.imageUrls`` is saved as a ``Url`` and
    linked through ``UrlOrder`` / ``ImageOrder`` respectively.

    ``*args`` and ``**kwargs`` are accepted but not used.
    """
    if self.duplication:
        self.merge()
        return

    document = Document()
    document.description = self.description
    # Saved before any link row is created; every link below is given this
    # document instance.
    document.save()

    # (tokenizer, raw value, link model) in the order the links are created.
    tokenSources = (
        (getTokensFromText, self.title, TitleOrder),
        (getTokensFromText, self.date, DateOrder),
        (getTokensFromText, self.location, LocationOrder),
        (getTokensFromList, self.genres, GenresOrder),
        (getTokensFromList, self.artists, ArtistOrder),
        (getTokensFromList, self.tags, TagOrder),
    )
    for tokenize, rawValue, orderModel in tokenSources:
        for name in tokenize(rawValue):
            token = Token(name)
            token.save()
            orderModel.objects.create(token=token, document=document)

    # (iterable of addresses, link model) for plain and image URLs.
    urlSources = (
        (self.urls, UrlOrder),
        (self.imageUrls, ImageOrder),
    )
    for addresses, orderModel in urlSources:
        for address in addresses:
            link = Url(address)
            link.save()
            orderModel.objects.create(url=link, document=document)

    # The original read ``document.save`` without calling it, so this final
    # save never ran.
    document.save()
def merge(self):
    """Fold this item's data into the existing document ``self.duplication``.

    Does nothing when ``self.duplication`` is falsy. Otherwise:

    * the duplicate's description is filled from ``self.description`` only
      if the duplicate has none;
    * for every date, location, genre, artist and tag token a ``Token`` is
      saved and linked to the duplicate through the matching ``*Order``
      model, unless such a link already exists;
    * every entry of ``self.urls`` / ``self.imageUrls`` is saved as a
      ``Url`` and linked through ``UrlOrder`` / ``ImageOrder`` in the same
      way.

    NOTE(review): no ``TitleOrder`` links are added here, unlike in
    ``save()`` -- confirm this is intentional.
    """
    target = self.duplication
    if not target:
        return

    if not target.description:
        target.description = self.description
        # Persist the description right away, before any links are created.
        target.save()

    # (tokenizer, raw value, link model) in the order the links are created.
    tokenSources = (
        (getTokensFromText, self.date, DateOrder),
        (getTokensFromText, self.location, LocationOrder),
        (getTokensFromList, self.genres, GenresOrder),
        (getTokensFromList, self.artists, ArtistOrder),
        (getTokensFromList, self.tags, TagOrder),
    )
    for tokenize, rawValue, orderModel in tokenSources:
        for name in tokenize(rawValue):
            token = Token(name)
            token.save()
            # Only link the token if the duplicate is not linked to it yet.
            if not orderModel.objects.filter(token=token, document=target).exists():
                orderModel.objects.create(token=token, document=target)

    # (iterable of addresses, link model) for plain and image URLs.
    urlSources = (
        (self.urls, UrlOrder),
        (self.imageUrls, ImageOrder),
    )
    for addresses, orderModel in urlSources:
        for address in addresses:
            link = Url(address)
            link.save()
            if not orderModel.objects.filter(url=link, document=target).exists():
                orderModel.objects.create(url=link, document=target)

    # The original read ``self.duplication.save`` without calling it, so
    # this final save never ran.
    target.save()
def findDuplicateInResults(document, results):
    """Return the first entry of ``results`` that duplicates ``document``.

    A result counts as a duplicate when either

    * the name of one of its URLs (``result.urls.all()``) is contained in
      ``document.urls``, or
    * ``hasOverlap`` reports an overlap for all four of date, location,
      genres and artists between the document's tokens and the result's
      related objects.

    Results are examined in order; ``None`` is returned when none matches.
    """
    for candidate in results:
        # A shared URL is enough on its own to call it a duplicate.
        if any(link.name in document.urls for link in candidate.urls.all()):
            return candidate

        # Otherwise date, location, genres and artists must all overlap.
        # The checks run in this order and stop at the first one that fails.
        if not hasOverlap(getTokensFromText(document.date), candidate.date.all()):
            continue
        if not hasOverlap(getTokensFromText(document.location), candidate.location.all()):
            continue
        if not hasOverlap(getTokensFromList(document.genres), candidate.genres.all()):
            continue
        if not hasOverlap(getTokensFromList(document.artists), candidate.artists.all()):
            continue
        return candidate

    return None