Example No. 1
def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare( article.title, nearest.title ) < 0.7:
                    results.append( article )
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
        n.add( article )
    return render( request, "dump.html", dictionary = { "article_list": results, } )
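The view above relies on a handful of calls from the ngram package: building an index, add(), search(), and the NGram.compare classmethod. A minimal sketch of those calls, assuming the PyPI "ngram" package (3.x API); the strings and the key= function are illustrative only, not from the original view.

# Sketch of the NGram calls used above (PyPI "ngram" package assumed).
from ngram import NGram

index = NGram(key=lambda s: s.lower())  # key= normalises items before n-gramming
index.add("Spam and Eggs")
index.add("Spam, Spam, Eggs and Spam")

# search() returns (item, similarity) pairs, best match first
print(index.search("spam and eggs", threshold=0.3))

# compare() scores two strings directly, without building an index
print(NGram.compare("Ham and Eggs", "Spam and Eggs"))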
Example No. 2
def build_title_index(movies, tvshows):
    start = time.time()

    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [entity['title'] for entity in entities]

    mapped_entities = {}
    for entity in entities:
        value = entity['title']
        if value not in mapped_entities:
            mapped_entities[value] = []

        mapped_entities[value].append(entity)

    logger.debug('Iterating title took {} ms'.format(
        int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building title index took {} ms'.format(
        int((time.time() - start) * 1000)))

    return index, mapped_entities
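A hypothetical use of the (index, mapped_entities) pair returned above: fuzzy-match a query against the title index, then map the matched titles back to their entities. The movies/tvshows lists and the query string are placeholders, not part of the original code.

# Hypothetical usage of build_title_index(); inputs and query are placeholders.
index, mapped_entities = build_title_index(movies, tvshows)
for title, score in index.search('the matrix', threshold=0.5):
    for entity in mapped_entities[title]:
        print(score, entity['title'])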
Example No. 3
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare(article.title, nearest.title) < 0.7:
                    results.append(article)
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={
        "article_list": results,
    })
Example No. 4
def build_cast_index(movies, tvshows, key):
    start = time.time()

    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = [[cast[key] for cast in entity['cast']] for entity in entities]
    values = list(set(itertools.chain.from_iterable(values)))

    mapped_entities = {}
    for entity in entities:
        for cast in entity['cast']:
            value = cast[key]
            if value not in mapped_entities:
                mapped_entities[value] = []

            mapped_entities[value].append(entity)

    logger.debug('Iterating {} took {} ms'.format(
        key, int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building {} index took {} ms'.format(
        key, int((time.time() - start) * 1000)))

    return index, mapped_entities
Example No. 5
def build_collection_index(movies, tvshows):
    start = time.time()

    entities = list(itertools.chain.from_iterable([movies, tvshows]))
    values = list(
        set([
            parse_collection(entity['set']) for entity in entities
            if 'set' in entity and len(entity['set']) > 0
        ]))

    mapped_entities = {}
    for entity in entities:
        if 'set' in entity and entity['set']:
            value = parse_collection(entity['set'])
            if value not in mapped_entities:
                mapped_entities[value] = []

            mapped_entities[value].append(entity)

    logger.debug('Iterating collection took {} ms'.format(
        int((time.time() - start) * 1000)))

    start = time.time()
    index = NGram()
    for value in values:
        index.add(value)
    logger.debug('Building collection index took {} ms'.format(
        int((time.time() - start) * 1000)))

    return index, mapped_entities
Example No. 6
def build_multiclusters(inlines, threshold=0.05, N=4):
	clusters = []
	ignoreus = []

	for i, iline in enumerate(inlines):
		if i in ignoreus:
			continue

		iString = " ".join(iline.split(" :::: ")[:3])

		ignoreus.append(i)

		icluster = {}
		icluster[iline] = -1
		iModel = NGram(iString)

		for j in range(i, len(inlines)):
			if j in ignoreus:
				continue
		
			jline = inlines[j]
			jString = " ".join(jline.split(" :::: ")[:3])
		
			results = iModel.search(jString)
			score = sum([y for x,y in results]) / len(results) \
					if len(results) > 0 else 0.0
			print(score)

			if score > threshold:
				icluster[jline] = score
				iModel.add(jString)
				ignoreus.append(j)

		clusters.append(icluster)
	return clusters
Example No. 7
class Plagiarism:
	def __init__(self,text):
		self.ng=NGram()
		file = open(text,"r")
		linea = file.readline()
		while linea != '':
			if linea != '\n':
				self.ng.add(linea)
			linea = file.readline()
		self.lsn = list(self.ng)
		file.close()

	def verify(self,text_compare):
		results = []
		dictio = []
		file2 = open(text_compare,"r")
		linea2 = file2.readline()
		while linea2 != '':	
			if linea2 != '\n':
				dictio += [self.ng.items_sharing_ngrams(linea2)]
				compares = 0.0
				for parrafo in self.lsn:
					comp = NGram.compare(parrafo,linea2)
					if compares < comp:
						compares = comp
				results += [compares]
			linea2 = file2.readline()
		file2.close()

		major_ocurrences=[]
		for d in dictio:
			major=0
			for val in d.values():
				if major<val:
					major=val
			major_ocurrences+=[major]
			

		avg_perc=0.0
		for r in results:
			avg_perc+=r
		avg_perc=avg_perc/len(results)

		print("Mayor numero de ocurrencias por parrafo del texto copia: "+repr(major_ocurrences))
		print("Porcentaje Similitud: "+repr(avg_perc))
Example No. 8
    def handle( self, *args, **options ):
        if "simonly" in args:
            new_count = 100000
        else:
            new_count = 0
            for source in Source.objects.filter( scraper = 'feedparser', status__in = ( 'silent', 'live' ) ):
                l = feedparser.parse( source.scraper_config )
                ok = True
                if l[ "bozo" ] == 1:
                   if not isinstance( l[ "bozo_exception" ], feedparser.ThingsNobodyCaresAboutButMe ):
                       ok = False
                if ok:
                    for article in l[ "entries" ]:
                        #print "Reading feed entry %s: '%s'" % ( article[ "id" ], article[ "title" ] )
                        a, created = Article.objects.get_or_create(
                            source = source,
                            # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                            source_reference = article[ "id" ],
                            defaults = {
                                'date_created' : datetime.now(),
                                'source_url' : article[ "link" ],
                                'title' : self.normalise( article[ "title" ] ),
                                'num_comments' : article.get( "slash_comments", 0 ),
                                'summary' : article[ "summary" ],
                                'author' : article.get( "author", "" ),
                                'date_published' : datetime(*(article[ "updated_parsed" ][:6])),
                                'status' : "live"
                            }
                        )
                        if created:
                            #print "Creating new article."
                            pass
                        else:
                            #print "Updating article."
                            pass
                        new_count += 1
                        if article.has_key( "content" ):
                            # TODO test for multiple content blocks and pick most appropriate
                            a.body = article[ "content" ][0][ "value" ]
                        a.tags.clear()
                        for tag in article.get( "tags", () ):
                            a.tags.add( tag[ "term" ] )
                        a.save()

                else:
                    logging.error( "Could not read feed for file '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) ) 
                    logging.error( "Skipping '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) ) 
                    break

        #calculate similarities
        #create a similarity corpus of last 200 docs

        def enrich( obj ):
            s = unicode( obj )
            # simple stop words
            s = re.sub( r"\b(the|of|in|a)\b", "", s, re.IGNORECASE )
            # type prefixes
            s = re.sub( r"^(trailer|review|report|screenshots|video):\s*", "", s, re.IGNORECASE )
            return s
        n = NGram( warp=2.5, iconv=enrich )
        articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:(new_count*4)]
        for article in articles:
            if "simonly" in args:
                article.is_duplicate = False
                article.duplicate_of = None
                article.save()
                continue
        #articles = Article.objects.filter( status = "live", is_duplicate = False ).order_by( "-date_published" )[:new_count]
        #for article in articles:
            #print( u"similarity for %s" % ( article.title, ) )
            sim = filter( lambda a: a[1] > 0.4, n.search( article.title ) )
            for match in sim:
                nearest = match[0]
                if nearest.source == article.source:
                    continue
                if nearest.is_duplicate:
                    nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
                article.is_duplicate = True
                article.duplicate_of = nearest
                #print u" is duplicate of %s" % ( nearest.title, )
                article.save()
                break
            n.add( article )
Example No. 9
import csv

from ngram import NGram

records = NGram()

with open('./data/houses.csv', 'r', encoding='windows-1251') as f:
    for line in csv.reader(f, delimiter=';'):
        records.add(' '.join(line).lower())

while True:
    print('Enter search text:')
    search_text = input().lower()
    print('find', records.find(search_text, 0.8))  # find() returns the best match at or above the 0.8 threshold, or None

Example No. 10
    def test_unigram(self):
        n = NGram(0)
        n.add('after')
        assert n.next_word() == 'after'
Example No. 11
    def test_trigram(self):
        n = NGram(2)
        n.add('after', ('before', 'other'))
        assert n.next_word(('before', 'other')) == 'after'
Example No. 12
    def test_bigram(self):
        n = NGram(1)
        n.add('after', ('before',))
        assert n.next_word(('before',)) == 'after'
Example No. 13
    def handle(self, *args, **options):
        if "simonly" in args:
            new_count = 100000
        else:
            new_count = 0
            for source in Source.objects.filter(scraper='feedparser',
                                                status__in=('silent', 'live')):
                l = feedparser.parse(source.scraper_config)
                ok = True
                if l["bozo"] == 1:
                    if not isinstance(l["bozo_exception"],
                                      feedparser.ThingsNobodyCaresAboutButMe):
                        ok = False
                if ok:
                    for article in l["entries"]:
                        #print "Reading feed entry %s: '%s'" % ( article[ "id" ], article[ "title" ] )
                        a, created = Article.objects.get_or_create(
                            source=source,
                            # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                            source_reference=article["id"],
                            defaults={
                                'date_created': datetime.now(),
                                'source_url': article["link"],
                                'title': self.normalise(article["title"]),
                                'num_comments': article.get("slash_comments", 0),
                                'summary': article["summary"],
                                'author': article.get("author", ""),
                                'date_published': datetime(*(article["updated_parsed"][:6])),
                                'status': "live"
                            })
                        if created:
                            #print "Creating new article."
                            pass
                        else:
                            #print "Updating article."
                            pass
                        new_count += 1
                        if article.has_key("content"):
                            # TODO test for multiple content blocks and pick most appropriate
                            a.body = article["content"][0]["value"]
                        a.tags.clear()
                        for tag in article.get("tags", ()):
                            a.tags.add(tag["term"])
                        a.save()

                else:
                    logging.error("Could not read feed for file '%s': %s" %
                                  (source.scraper_config, l["bozo_exception"]))
                    logging.error("Skipping '%s': %s" %
                                  (source.scraper_config, l["bozo_exception"]))
                    break

        #calculate similarities
        #create a similarity corpus of last 200 docs

        def enrich(obj):
            s = unicode(obj)
            # simple stop words
            s = re.sub(r"\b(the|of|in|a)\b", "", s, re.IGNORECASE)
            # type prefixes
            s = re.sub(r"^(trailer|review|report|screenshots|video):\s*", "",
                       s, re.IGNORECASE)
            return s

        n = NGram(warp=2.5, iconv=enrich)
        articles = Article.objects.filter(
            status="live").order_by("date_published")[:(new_count * 4)]
        for article in articles:
            if "simonly" in args:
                article.is_duplicate = False
                article.duplicate_of = None
                article.save()
                continue
        #articles = Article.objects.filter( status = "live", is_duplicate = False ).order_by( "-date_published" )[:new_count]
        #for article in articles:
        #print( u"similarity for %s" % ( article.title, ) )
            sim = filter(lambda a: a[1] > 0.4, n.search(article.title))
            for match in sim:
                nearest = match[0]
                if nearest.source == article.source:
                    continue
                if nearest.is_duplicate:
                    nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
                article.is_duplicate = True
                article.duplicate_of = nearest
                #print u" is duplicate of %s" % ( nearest.title, )
                article.save()
                break
            n.add(article)