Example #1
def ngram_similarity(data, col1, col2):
    cos = []
    for i in range(len(data.id)):
        st = data[col1][i]
        title = data[col2][i]
        n = NGram(title.split(), key=lambda x: x[1])
        for s in st.split():
            n.search(s)

        tfidf = sktf.TfidfVectorizer().fit_transform([st, title])
        c = ((tfidf * tfidf.T).A)[0, 1]
        cos.append(c)
    return cos
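A minimal, hypothetical driver for the snippet above. It assumes the imports the example leaves implicit (from ngram import NGram, import sklearn.feature_extraction.text as sktf, plus pandas for the frame); the column names are invented for illustration.

import pandas as pd

df = pd.DataFrame({
    "id": [1, 2],
    "search_term": ["leather sofa", "led desk lamp"],
    "product_title": ["brown leather sofa bed", "adjustable led desk lamp"],
})
# One TF-IDF cosine similarity per row of the frame.
print(ngram_similarity(df, "search_term", "product_title"))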
Example #2
def build_multiclusters(inlines, threshold=0.05, N=4):
	clusters = []
	ignoreus = []

	for i, iline in enumerate(inlines):
		if i in ignoreus:
			continue

		iString = " ".join(iline.split(" :::: ")[:3])

		ignoreus.append(i)

		icluster = {}
		icluster[iline] = -1
		iModel = NGram(iString)

		for j in range(i, len(inlines)):
			if j in ignoreus:
				continue
		
			jline = inlines[j]
			jString = " ".join(jline.split(" :::: ")[:3])
		
			results = iModel.search(jString)
			score = sum([y for x,y in results]) / len(results) \
					if len(results) > 0 else 0.0
			print(score)

			if score > threshold:
				icluster[jline] = score
				iModel.add(jString)
				ignoreus.append(j)

		clusters.append(icluster)
	return clusters
Example #3
	def map(self,phrase):
		for term in phrase:
			if len(term) > 4:
				continue
			for word in self.corpus:
				z = set(term) & set(word)
				
				matches = []
				if len(z) > 0 and len(z) < len(term):
					#
					#
					g = NGram(z - set(term))
					#matches = g.search(term)
				else:
					#
					# At this point we assume context is not informative
					# In the advent of context not being informative, we resort to fuzzy lookup
					#		
					g = NGram(word)
					#matches = g.search(term)
				g.remove(term)
				matches = g.search(term)
				key = None
				value = None					
				if len(matches) > 0:
					matches = list(matches[0])
					Pz_ = len(matches) / self.size
					Px_ = fuzz.ratio(term,matches[0]) / 100
					if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
						key = term
						value= {}
						value= [matches[0],Pz_,Px_,1]
						self.emit (key,value)
Example #4
def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare( article.title, nearest.title ) < 0.7:
                    results.append( article )
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
        n.add( article )
    return render( request, "dump.html", dictionary = { "article_list": results, } )
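The key= argument above indexes whole Article objects by their title, so search() hands back the matching objects rather than strings. A self-contained sketch of the same idiom with a stand-in record type instead of the Django model (the names here are hypothetical):

from collections import namedtuple

from ngram import NGram

Record = namedtuple("Record", "title")
index = NGram(key=lambda r: r.title)
index.add(Record("Review: Halo Infinite"))
index.add(Record("Interview with the developers"))
# Each result is an (item, similarity) pair, so the matched Record comes back whole.
print(index.search("Review: Halo 5"))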
Example #5
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
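lowstrip() is not shown in this snippet; it presumably just normalises a cell before indexing and searching. A guess at a minimal definition, assuming case- and whitespace-insensitive matching is all that is wanted:

def lowstrip(term):
    """Lower-case and strip surrounding whitespace before n-gram comparison."""
    return term.lower().strip()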
Example #6
def sonucbul():
    kelimeler = list()

    v = NGram(ngramdatawords)
    sonucthreshold = list()
    sonuckelime = list()

    kelimedizisi = np.zeros((1, len(ngramdatawords)), dtype='int8')
    yorum = e1.get()  ###############
    cevirici = str.maketrans('', '', punctuation)
    yorum = yorum.translate(cevirici)
    cevirici = str.maketrans('', '', digits)
    yorum = yorum.translate(cevirici)
    yorum = yorum.lower()
    kelimeler.clear()
    kelimeler = yorum.split()
    for j in range(0, len(kelimeler), 1):
        sonucthreshold.clear()
        sonuckelime.clear()
        for ngrami in v.search(kelimeler[j], threshold=0.4):
            sonuckelime.append(str(ngrami[0]))
            sonucthreshold.append(ngrami[1])  # keep the float similarity; int() would truncate every score below 1.0 to 0
        if (len(sonuckelime) != 0):
            kelimedizisi[0][ngramdatawords.index(
                sonuckelime[sonucthreshold.index(max(sonucthreshold))])] += 1
    tmpdf = pd.DataFrame(kelimedizisi)
    sonuc = ngrammodel.predict(tmpdf)
    cevirici = str.maketrans('', '', punctuation)
    cevap = str(sonuc).translate(cevirici)
    print("Yorum= " + yorum + " Yorum Sonucu= " + str(sonuc))

    e1.delete(0, END)
    Label(master, text="Puan(1-5) =" + str(cevap)).grid(row=2)
Example #7
File: views.py  Project: ntas/channelfunnel
def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=2.5, iconv=enrich)
    articles = Article.objects.filter(
        status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.7, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare(article.title, nearest.title) < 0.7:
                    results.append(article)
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={
        "article_list": results,
    })
Example #8
def main(left_path, left_column, right_path, right_column, outfile, titles,
         join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp,
                  key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Example #9
    def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""

        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'),
                         [('askfjwehiuasdfji', 1.0),
                          ('asdfawe', 0.17391304347826086),
                          ('asfwef', 0.083333333333333329),
                          ('adfwe', 0.041666666666666664)])
        self.assertEqual(
            idx.search('afadfwe')[:2], [('adfwe', 0.59999999999999998),
                                        ('asdfawe', 0.20000000000000001)])

        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
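The assertions imply what self.items must contain. A hypothetical fixture that would make the test self-contained (the package's real setUp may differ):

    def setUp(self):
        # The four strings whose similarity scores are asserted above.
        self.items = ['askfjwehiuasdfji', 'asdfawe', 'asfwef', 'adfwe']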
Example #10
    def test_set_operations(self):
        """Test advanced set operations"""
        items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
        items2 = set(["cdefg", "lmnop"])
        idx1 = NGram(items1)
        idx2 = NGram(items2)
        results = lambda L: sorted(x[0] for x in L)
        # Item removal
        self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
        idx1.remove('abcde')
        self.assertEqual(results(idx1.search('cde')), ["cdefg"])
        # Set intersection operation
        items1.remove('abcde')
        idx1.intersection_update(idx2)
        self.assertEqual(idx1, items1.intersection(items2))
        self.assertEqual(results(idx1.search('lmn')), [])
        self.assertEqual(results(idx1.search('ijk')), [])
        self.assertEqual(results(idx1.search('def')), ['cdefg'])
Example #11
    def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""

        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'), [
            ('askfjwehiuasdfji', 1.0),
            ('asdfawe', 0.17391304347826086),
            ('asfwef', 0.083333333333333329),
            ('adfwe', 0.041666666666666664)])
        self.assertEqual(idx.search('afadfwe')[:2],
                [('adfwe', 0.59999999999999998),
                 ('asdfawe', 0.20000000000000001)])

        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
Example #12
    def test_set_operations(self):
        """Test advanced set operations"""
        items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
        items2 = set(["cdefg", "lmnop"])
        idx1 = NGram(items1)
        idx2 = NGram(items2)
        results = lambda L: sorted(x[0] for x in L)
        # Item removal
        self.assertEqual(results(idx1.search('cde')), ["abcde","cdefg"])
        idx1.remove('abcde')
        self.assertEqual(results(idx1.search('cde')), ["cdefg"])
        # Set intersection operation
        items1.remove('abcde')
        idx1.intersection_update(idx2)
        self.assertEqual(idx1, items1.intersection(items2))
        self.assertEqual(results(idx1.search('lmn')), [])
        self.assertEqual(results(idx1.search('ijk')), [])
        self.assertEqual(results(idx1.search('def')), ['cdefg'])
Example #13
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')

    >>> open('right.csv', 'w').write('''ID,NAME
    ... ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ... right_path='right.csv', right_column=1, outfile='out.csv',
    ... titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print(open('out.csv').read())  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Example #14
def get_ngram_similarity(gold, candidates, N=3, strip_space=True):
    def _strip_space(s):
        if not strip_space:
            return s
        return "\n".join([part.strip(" ") for part in s.split("\n")])

    ng = NGram([_strip_space(gold)], N=N)

    sims = []

    for c in candidates:
        ng_out = ng.search(_strip_space(c))
        if len(ng_out) == 0:
            sims.append(0.0)
        else:
            sims.append(ng_out[0][1])

    return sims
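A small hypothetical call, assuming from ngram import NGram is in scope: an identical candidate scores 1.0, and a candidate sharing no character trigrams with the gold string falls back to 0.0.

gold = "def add(a, b):\n    return a + b"
candidates = [gold, "def sub(a, b):\n    return a - b", "zzz qqq"]
print(get_ngram_similarity(gold, candidates, N=3))
# Expected shape: [1.0, <something between 0 and 1>, 0.0]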
Example #15
    def wordsoccurrences(self, words_list, option='ortony'):
        frequencies = FreqDist(words_list)
        ordered_unigrams = frequencies.most_common()
        if option == 'ortony':
            lexicon = self.ortony_list
        else:
            lexicon = self.profane_words
        count = 0
        for t_word, count_w in ordered_unigrams:
            lower_word = t_word.lower()
            three_grams = NGram(lexicon)
            likely_words = three_grams.search(lower_word)
            if len(likely_words) > 0:
                # if lower_word in lexicon:
                count += 1 * count_w

            if lower_word in lexicon:
                count += 1
        return count
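One observation on the loop above: NGram(lexicon) is rebuilt for every unigram. A sketch of the same counting with the index hoisted out of the loop, written as a standalone function rather than the original method (behaviour should be unchanged as long as lexicon is fixed):

from nltk import FreqDist
from ngram import NGram


def wordsoccurrences_hoisted(words_list, lexicon):
    three_grams = NGram(lexicon)  # build the lexicon index once
    count = 0
    for t_word, count_w in FreqDist(words_list).most_common():
        lower_word = t_word.lower()
        if three_grams.search(lower_word):   # any sufficiently similar lexicon entry
            count += 1 * count_w
        if lower_word in lexicon:            # exact hit, as in the original
            count += 1
    return count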
Example #16
    def verify(self, text_compare):
        results = []
        texto = []
        '''
		file2 = open(text_compare,"r")
		for linea2 in file2.readlines():
			texto+=linea2.split(" ")
		tng=NGram(texto)
		file2.close()
		'''
        file2 = open(text_compare, "r")
        linea2 = file2.readline()
        while linea2 != '':
            texto += linea2.split(" ")
            linea2 = file2.readline()
        tng = NGram(texto)
        file2.close()

        for ngs in self.ng:
            count = 0
            for word in list(ngs):
                for porc in tng.search(word):
                    if porc[1] > 0.3:
                        count += 1
            results += [count]

        print(list(results))

        pos = 0
        count = 0
        i = 0
        for res in results:
            if count < res:
                count = res
                pos = i
            i += 1

        if results[pos] > 2:
            print("Tema mas preciso del texto: " + repr(self.topic[pos]))
        else:
            print("No se ha podido precisar de que trata")
        print ""
Example #17
	def verify(self,text_compare):
		results = []
		texto = []
		'''
		file2 = open(text_compare,"r")
		for linea2 in file2.readlines():
			texto+=linea2.split(" ")
		tng=NGram(texto)
		file2.close()
		'''
		file2 = open(text_compare,"r")
		linea2 = file2.readline()
		while linea2 != '':
			texto+=linea2.split(" ")
			linea2 = file2.readline()
		tng=NGram(texto)
		file2.close()

		for ngs in self.ng:
			count=0
			for word in list(ngs):
				for porc in tng.search(word):
					if porc[1]>0.3:
						count+=1
			results+=[count]

		print(list(results))

		pos=0
		count=0
		i=0
		for res in results:
			if count<res:
				count=res
				pos=i
			i+=1

		if results[pos]>2:
			print("Tema mas preciso del texto: "+repr(self.topic[pos]))
		else:
			print("No se ha podido precisar de que trata")
		print ""			
Example #18
    def _location_choices(self, search):
        ngram_index = NGram(key=self._location_to_name)
        ngram_index.update(Ward.objects.all())
        ngram_index.update(District.objects.all())
        locations = ngram_index.search(search)[:self.num_choices]
        return [self._location_to_choice(l) for l, _score in locations]
Example #19
    def test_scifi_genre(self):
        index = NGram(items=['Sci-Fi'], key=lambda x: x.lower())

        self.assertGreater(index.search('science fiction')[0][1], 0)
        self.assertEqual(index.search('sci-fi')[0][1], 1)
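The same comparison can be done pairwise with the class method used in the earlier tests; exact scores depend on the defaults (N=3, padding, warp=1.0), so only the end points are predictable:

from ngram import NGram

print(NGram.compare('sci-fi', 'sci-fi'))           # 1.0
print(NGram.compare('sci-fi', 'science fiction'))  # small, but greater than 0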
Example #20
    """
    address_longlat = []      
    for address in location:
        g = geocoder.google(address)
        list_longlat = g.latlnga
        list_longlat.insert(0,address)
        address_longlat.append(list_longlat)
    print(address_longlat)

    """
    get long lat from data POI using Ngram
    """

    with open("D:/tasya/python/code/Geo-Tag/corpus/sample-poi1.csv") as file:
        reader = csv.reader(file)
        #reader.next()
        corpus = []
        for row in reader:
            corpus.append(row[0])
            
    corpus_name = []
    for word in corpus:
        corpus_name.append(word.split(';')[0])
    address = []    
    G = NGram(corpus_name)
    G_latlng = NGram(corpus)
    for word in location:       
        out = G.search(word)
        out1 = G_latlng.search(out[0][0])  # presumably search(); NGram (a set subclass) has no append()
        address.append(out1[0][0])
Example #21
	def run(self):
		N = len(self.context)
		
		imatches = []
		found = {}
		Y = range(0,len(self.bag))
		for i in range(0,N):
			Xo_ = list(self.bag[i])	# skip_gram
			#Y = (Set(range(0,N)) - (Set([i]) | Set(imatches)))
			for ii in Y:
				if self.bag[i] == self.bag[ii] :
					imatches.append(ii) ;
					continue
				#
				# We are sure we are not comparing the identical phrase
				# NOTE: Repetition doesn't yield learning, rather context does.
				# Lets determine if there are common terms
				#
				Z = set(self.bag[i]) & set(self.bag[ii])
				
				if len(Z) > 0 and len(Xo_) > 0:

					Xo_ 	= set(Xo_) - Z # - list(set(bag[i]) - set(bag[ii]))
					Yo_ 	= set(self.bag[ii]) - Z #list(set(bag[ii]) - set(bag[i]))
					size 	= len(Xo_)
					g = NGram(Yo_)	
					for term in Xo_:
						
						xo = g.search(term)
						if len(xo) > 0 and len(term) < 4:
							xo = xo[0]
						else:
							continue;
						xo = list(xo)
						xo_i = self.bag[i].index(term) 
						yo_i = self.bag[ii].index(xo[0])
						#
						# We have the pair, and we will compute the distance
						#
						ratio = fuzz.ratio(term,xo[0])/100
						is_subset = len(set(term) & set(xo[0])) == len(term)
						if is_subset and len(term) < len(xo[0]) and ratio > 0.5 and xo_i ==yo_i:
							
							xo[1] = [ratio,xo_i]
							if (term not in self.info):
								#xo[1] = ratio
								self.info[term] = [term,xo[0]]+xo[1]
							elif term in self.info and ratio > self.info[term][1] :							
								self.info[term] = [term,xo[0]]+xo[1]
							
							
							imatches.append(ii)
							break;
		#
		# At this point we consolidate all that has been learnt
		# And make it available to the outside word, otherwise client should retrieve it
		#
		self.lock.acquire()
		if self.queue is not None:
			
			for term in self.info:	
				value = ['thread # ',self.name]+list(self.info[term])							
				self.queue.put(value)
		self.lock.release()
Example #22
    def handle(self, *args, **options):
        if "simonly" in args:
            new_count = 100000
        else:
            new_count = 0
            for source in Source.objects.filter(scraper='feedparser',
                                                status__in=('silent', 'live')):
                l = feedparser.parse(source.scraper_config)
                ok = True
                if l["bozo"] == 1:
                    if not isinstance(l["bozo_exception"],
                                      feedparser.ThingsNobodyCaresAboutButMe):
                        ok = False
                if ok:
                    for article in l["entries"]:
                        #print "Reading feed entry %s: '%s'" % ( article[ "id" ], article[ "title" ] )
                        a, created = Article.objects.get_or_create(
                            source=source,
                            # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                            source_reference=article["id"],
                            defaults={
                                'date_created':
                                datetime.now(),
                                'source_url':
                                article["link"],
                                'title':
                                self.normalise(article["title"]),
                                'num_comments':
                                article.get("slash_comments", 0),
                                'summary':
                                article["summary"],
                                'author':
                                article.get("author", ""),
                                'date_published':
                                datetime(*(article["updated_parsed"][:6])),
                                'status':
                                "live"
                            })
                        if created:
                            #print "Creating new article."
                            pass
                        else:
                            #print "Updating article."
                            pass
                        new_count += 1
                        if "content" in article:
                            # TODO test for multiple content blocks and pick most appropriate
                            a.body = article["content"][0]["value"]
                        a.tags.clear()
                        for tag in article.get("tags", ()):
                            a.tags.add(tag["term"])
                        a.save()

                else:
                    logging.error("Could not read feed for file '%s': %s" %
                                  (source.scraper_config, l["bozo_exception"]))
                    logging.error("Skipping '%s': %s" %
                                  (source.scraper_config, l["bozo_exception"]))
                    break

        #calculate similarities
        #create a similarity corpus of last 200 docs

        def enrich(obj):
            s = str(obj)
            # simple stop words
            s = re.sub(r"\b(the|of|in|a)\b", "", s, re.IGNORECASE)
            # type prefixes
            s = re.sub(r"^(trailer|review|report|screenshots|video):\s*", "",
                       s, re.IGNORECASE)
            return s

        n = NGram(warp=2.5, iconv=enrich)
        articles = Article.objects.filter(
            status="live").order_by("date_published")[:(new_count * 4)]
        for article in articles:
            if "simonly" in args:
                article.is_duplicate = False
                article.duplicate_of = None
                article.save()
                continue
        #articles = Article.objects.filter( status = "live", is_duplicate = False ).order_by( "-date_published" )[:new_count]
        #for article in articles:
        #print( u"similarity for %s" % ( article.title, ) )
            sim = filter(lambda a: a[1] > 0.4, n.search(article.title))
            for match in sim:
                nearest = match[0]
                if nearest.source == article.source:
                    continue
                if nearest.is_duplicate:
                    nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
                article.is_duplicate = True
                article.duplicate_of = nearest
                #print u" is duplicate of %s" % ( nearest.title, )
                article.save()
                break
            n.add(article)
Example #23
def get_similars(data, target, threshold):
    G = NGram(target)
    return G.search(data, threshold=threshold)[0][0]
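A hypothetical call; note the helper assumes at least one hit above the threshold, otherwise the [0][0] indexing raises IndexError.

fruits = ["apple", "banana", "grape"]
print(get_similars("aple", fruits, threshold=0.2))  # best fuzzy match, expected: 'apple'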
Example #24
    def handle( self, *args, **options ):
        if "simonly" in args:
            new_count = 100000
        else:
            new_count = 0
            for source in Source.objects.filter( scraper = 'feedparser', status__in = ( 'silent', 'live' ) ):
                l = feedparser.parse( source.scraper_config )
                ok = True
                if l[ "bozo" ] == 1:
                   if not isinstance( l[ "bozo_exception" ], feedparser.ThingsNobodyCaresAboutButMe ):
                       ok = False
                if ok:
                    for article in l[ "entries" ]:
                        #print "Reading feed entry %s: '%s'" % ( article[ "id" ], article[ "title" ] )
                        a, created = Article.objects.get_or_create(
                            source = source,
                            # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                            source_reference = article[ "id" ],
                            defaults = {
                                'date_created' : datetime.now(),
                                'source_url' : article[ "link" ],
                                'title' : self.normalise( article[ "title" ] ),
                                'num_comments' : article.get( "slash_comments", 0 ),
                                'summary' : article[ "summary" ],
                                'author' : article.get( "author", "" ),
                                'date_published' : datetime(*(article[ "updated_parsed" ][:6])),
                                'status' : "live"
                            }
                        )
                        if created:
                            #print "Creating new article."
                            pass
                        else:
                            #print "Updating article."
                            pass
                        new_count += 1
                        if "content" in article:
                            # TODO test for multiple content blocks and pick most appropriate
                            a.body = article[ "content" ][0][ "value" ]
                        a.tags.clear()
                        for tag in article.get( "tags", () ):
                            a.tags.add( tag[ "term" ] )
                        a.save()

                else:
                    logging.error( "Could not read feed for file '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) ) 
                    logging.error( "Skipping '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) ) 
                    break

        #calculate similarities
        #create a similarity corpus of last 200 docs

        def enrich( obj ):
            s = str( obj )
            # simple stop words
            s = re.sub( r"\b(the|of|in|a)\b", "", s, re.IGNORECASE )
            # type prefixes
            s = re.sub( r"^(trailer|review|report|screenshots|video):\s*", "", s, re.IGNORECASE )
            return s
        n = NGram( warp=2.5, iconv=enrich )
        articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:(new_count*4)]
        for article in articles:
            if "simonly" in args:
                article.is_duplicate = False
                article.duplicate_of = None
                article.save()
                continue
        #articles = Article.objects.filter( status = "live", is_duplicate = False ).order_by( "-date_published" )[:new_count]
        #for article in articles:
            #print( u"similarity for %s" % ( article.title, ) )
            sim = filter( lambda a: a[1] > 0.4, n.search( article.title ) )
            for match in sim:
                nearest = match[0]
                if nearest.source == article.source:
                    continue
                if nearest.is_duplicate:
                    nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
                article.is_duplicate = True
                article.duplicate_of = nearest
                #print u" is duplicate of %s" % ( nearest.title, )
                article.save()
                break
            n.add( article )
Example #25
    get long lat from geocoder
    """
    address_longlat = []
    for address in location:
        g = geocoder.google(address)
        list_longlat = g.latlnga
        list_longlat.insert(0, address)
        address_longlat.append(list_longlat)
    print(address_longlat)
    """
    get long lat from data POI using Ngram
    """

    with open("D:/tasya/python/code/Geo-Tag/corpus/sample-poi1.csv") as file:
        reader = csv.reader(file)
        #reader.next()
        corpus = []
        for row in reader:
            corpus.append(row[0])

    corpus_name = []
    for word in corpus:
        corpus_name.append(word.split(';')[0])
    address = []
    G = NGram(corpus_name)
    G_latlng = NGram(corpus)
    for word in location:
        out = G.search(word)
        out1 = G_latlng.search(out[0][0])  # presumably search(); NGram (a set subclass) has no append()
        address.append(out1[0][0])
Example #26
File: main.py  Project: ARGHZ/ClassifTweets
        inst = CodigoAritm(alfabeto, probabilidades)
    except SimbProbsError as e:
        print(e)
    except ItemVacioError as e:
        print(e)
    else:
        mensajes = tuple(muestreo)

        for mensaje in mensajes:
            caracteres = NGram(mensaje.split(' '))
            try:
                print(
                    '\nEntropía de \'{0}\': {1} \nTotal de vocales: {2} \t Total de palabras: {3}'
                    .format(mensaje, str(inst.entropiadelmensaje(mensaje)),
                            contarvocales(mensaje),
                            len(word_tokenize(mensaje))))
                inst.precodmsj(mensaje + '~')
            except ExistSimbError as e:
                print('{0} \t Ignorando mensaje'.format(e))
            else:
                for palabrota in lexico:
                    minusculas = palabrota[0].lower()
                    query = caracteres.search(minusculas)
                    coincidencias = [
                        match for match in query if match[1] > 0.29
                    ]
                    if len(coincidencias) > 0:
                        print('\tBuscando >> {0}: {1}'.format(
                            minusculas, coincidencias[0]))
    finally:
        print('\nTerminando ejecución del programa...')
Example #27
         #flog.write('RESULT: ' + subs[sub_idx].text + '\n')
         sub_idx += 1
         dialogue_idx += 1
         return_to_dialogue_idx = dialogue_idx
         num_fails = 0
     num_speakers_matched += 1
 # If we're not very confident in a match, find the matching scores of the
 # subtitle against each substring of the same length within the line.
 else:
     #flog.write('--CHECKING SUBSTRINGS--\n')
     num_words = len(curr_sub.split(' '))
     # Evaluate current dialogue line's substrings
     curr_line_substrings = get_all_substrings(
         num_words, num_words, curr_line)
     curr_line_ngrams = NGram(curr_line_substrings)
     curr_searches = curr_line_ngrams.search(curr_sub)
     if curr_searches:
         curr_candidate_matches, curr_candidate_scores = zip(
             *curr_searches)
         curr_max_substring_score = max(curr_candidate_scores)
         curr_line_substring_line = curr_candidate_matches[
             np.argmax(curr_candidate_scores)]
     else:
         # If no matches are returned, give a max score of 0
         curr_max_substring_score = 0.0
         curr_line_substring_line = None
     # Evaluate next dialogue line's substrings
     next_line_substrings = get_all_substrings(
         num_words, num_words, next_line)
     next_line_ngrams = NGram(next_line_substrings)
     next_searches = next_line_ngrams.search(curr_sub)