Example #1
    def search(self, query, verbose=0):
        """Searches files satisfying query

        It first decompose the query in ngrams, then score each document containing
        at least one ngram with the number. The ten document having the most ngrams
        in common with the query are selected.
        
        Args:
             query (str): what to search;
             results_number (int): number of results to return (default: 10)
        """
        if verbose > 0:
            print("searching " + query)
        query = query.lower()
        qgram = ng(query, self.slb)
        # Collect every term index sharing at least one n-gram with the query.
        qocument = set()
        for q in qgram:
            if q in self.ngrams:
                qocument.update(self.ngrams[q])
        self.qocument = qocument
        # Score each document by the summed occurrence counts of matched terms.
        results = {}
        for i in qocument:
            for j in self.D[i]:
                results[j] = results.get(j, 0) + self.D[i][j]
        # operator must be imported at module level for itemgetter.
        sorted_results = sorted(results.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
        return [self.elements[f[0]] for f in sorted_results]
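Every example in this listing calls a helper named ng whose definition is not shown. Judging by the call sites (character n-grams of strings in Examples #1 and #2, token lists elsewhere), it behaves like nltk.ngrams: a sliding window yielding consecutive n-gram tuples. A minimal sketch under that assumption:

# Assumed implementation of the ng helper used throughout this listing;
# equivalent in spirit to nltk.ngrams.
def ng(sequence, n):
    """Yield consecutive n-grams of sequence as tuples."""
    items = list(sequence)
    for i in range(len(items) - n + 1):
        yield tuple(items[i:i + n])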
Example #2
    def __init__(self, type_instance, slb=4, verbose=0):
        self.slb = slb
        self.elements = type_instance.elements
        self.documents = type_instance.documents
        self.terms = list(set([t for e in self.documents for t in e]))
        self.d = len(self.terms)
        self.N = len(self.documents)
        # D[i][j] = number of occurrences of term i in document j.
        self.D = {}
        if verbose >= 2:
            print("indexing documents")
        for i, t in enumerate(self.terms):
            for j, e in enumerate(self.documents):
                occs = e.count(t)
                if occs != 0:
                    if i not in self.D:
                        self.D[i] = {}
                    self.D[i][j] = occs
        if verbose >= 2:
            print("creating ngrams")
        # ngrams[g] = indices of the terms that contain the n-gram g.
        self.ngrams = {}
        for i, t in enumerate(self.terms):
            if len(t) >= slb:
                for g in ng(t, slb):
                    if g not in self.ngrams:
                        self.ngrams[g] = []
                    self.ngrams[g].append(i)
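For context, a hedged usage sketch tying Examples #1 and #2 together. The class name NgramIndex and the Corpus container are invented; only the attribute names (elements, documents) come from the constructor above, and operator must be imported for search() to work:

import operator  # needed by search()

class Corpus:
    def __init__(self, elements, documents):
        self.elements = elements      # e.g. display names or file paths
        self.documents = documents    # one token list per element

corpus = Corpus(elements=['a.txt', 'b.txt'],
                documents=[['hello', 'world'], ['hello', 'there']])
index = NgramIndex(corpus, slb=4)  # hypothetical class holding both methods
print(index.search('hello'))       # -> ['a.txt', 'b.txt'], both match "hello"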
Example #3
    def getNgrams(self, words, gram):
        """Return the n-grams of words that appear in self.wordList[gram]."""
        if len(words) >= gram:
            ngrams = list(ng(words, gram))
            ngramsFound = set(
                '#'.join(ngram) for ngram in ngrams if None not in ngram
            ).intersection(self.wordList[gram])
            return ngramsFound

        return set()  # keep the return type consistent with the branch above
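A comment-only usage sketch; the wordList contents are invented to show the expected shape (one set of '#'-joined n-grams per gram size):

# obj.wordList = {2: {'new#york', 'san#francisco'}}   # hypothetical state
# obj.getNgrams(['new', 'york', 'city'], 2)
# -> {'new#york'}   ('york#city' is dropped by the intersection)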
Example #4
    def closest_parts(self,
                      search_item_1,
                      search_item_2,
                      cosine=True,
                      close_words_n=4,
                      one_side=True):
        """returns a list of the  word combinations of at most close_words_n in search_item_2.tokens
            that is closest to the sum_vector of given search_item_1
            """

        if one_side:
            ngrams = []
            for n in range(close_words_n):
                ngrams += list(ng(search_item_2.used_tokens, n + 1))
            print(ngrams)  # debug output
            weights = []
            return_dict = dict()

            for ngram in ngrams:
                # tf-idf-weighted sum of the ngram's word vectors; dividing by
                # each token's global count keeps repeats from over-weighting.
                sum_vector = np.zeros(self.word_vectors.vector_size)
                for token in ngram:
                    if token in search_item_2.used_tokens:
                        count_token = search_item_2.used_tokens.count(token)
                        sum_vector = sum_vector + np.multiply(
                            text_tools.tfidf(token, search_item_2.lower_tokens,
                                             self.t_l_l) *
                            self.word_vectors.wv[token], 1 / count_token)
                weights.append(sum_vector)

                return_dict.update({
                    ngram:
                    text_tools.similarity_vec(sum_vector,
                                              search_item_1.sum_vec, cosine)
                })

            sd = sorted(return_dict.items(), key=operator.itemgetter(1))
            for ppp in sd:
                print(ppp)  # debug output
            if cosine:
                sd = sd[::-1]  # cosine similarity: higher is closer, best first
            return sd[0]
        else:
            return self.closest_parts(search_item_1,
                                      search_item_2,
                                      cosine,
                                      close_words_n,
                                      one_side=True), self.closest_parts(
                                          search_item_2,
                                          search_item_1,
                                          cosine,
                                          close_words_n,
                                          one_side=True)
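The inner loop above builds a tf-idf-weighted sum of word vectors. A standalone sketch of that idea, with a gensim-style KeyedVectors object assumed and a tfidf callable standing in for text_tools.tfidf:

import numpy as np

def weighted_sum_vector(ngram, all_tokens, tfidf, vectors):
    # Sum each token's embedding scaled by its tf-idf weight; dividing by
    # the token's global count keeps repeated words from dominating.
    total = np.zeros(vectors.vector_size)
    for token in ngram:
        if token in all_tokens:
            total += tfidf(token) * vectors[token] / all_tokens.count(token)
    return total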
Example #5
import json
import os

import spacy

def ngrams(file_name='Adje.json', language='nl', n=1, probabilities=True):
    # NOTE: n and probabilities are accepted but never used; only unigram
    # token counts are produced.
    path = os.getcwd() + '/../Lyrics/' + '_' + language + '/' + file_name
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    nlp = spacy.load('nl_core_news_sm')
    ngram = {}
    for song in data['songs']:
        lyrics = data['songs'][song]['lyrics']
        for sentence in lyrics.split('\n'):
            sentence = nlp(sentence)
            for token in sentence:
                token = token.text.lower()
                ngram[token] = ngram.get(token, 0) + 1
    print(ngram)
    print(len(ngram))
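With the layout implied by the path logic above (a Lyrics/_nl/Adje.json file in the parent of the working directory), usage is simply:

ngrams(file_name='Adje.json', language='nl')  # prints per-token counts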
Example #6
import re

def getNgrams(text: str, grams):
    splits = []

    # Split the text into sentences on punctuation and line breaks, tokenize
    # each sentence on non-Devanagari characters, and drop empty tokens.
    for s in re.split(r'[^\w\s]|[\n\r]+', text):
        tokens = [
            t.strip() for t in re.split(r'[^\u0900-\u097F]+', s) if t.strip()
        ]
        if len(tokens) >= grams:
            splits += ['#'.join(g) for g in ng(tokens, grams)]

    emoticons = getEmoticons(text)

    tokens = splits + emoticons

    return tokens
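A usage sketch, assuming the ng helper from Example #1; getEmoticons is defined elsewhere in the source, so it is stubbed here, and the Devanagari sample is invented:

def getEmoticons(text):
    # Stub for illustration only; the real helper is not shown in this listing.
    return []

print(getNgrams('यह एक परीक्षण है', 2))
# -> ['यह#एक', 'एक#परीक्षण', 'परीक्षण#है']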
Example #7
			return getFrequenciesList

	def getTotalCount(self):
		"""
			getTotalCount() returns the total number of tokens in this textual data set, as a float.
		"""
		return sum([value * 1.0 for value in self.getCounts().values()])

if __name__ == "__main__":
	import os, sys, codecs

	dataSource = "/Users/ducrix/Documents/Research/Python/data/ceb2.txt"
	#data = "this is a test text sample. It should contain a good good good amount of words because it has to be use for a test. I am going to try to use as full of words as possible possible possible."
	data = open(dataSource, 'r').read()
	#data = codecs.open(dataSource, 'r', 'utf8').read()

	e = Estimator(text=data)
	freq = e.getFrequencies(returnType=1)
	#print "count()\t\t\t\t", e.count("")
	#print "getCounts()\t\t\t", e.getCounts(returnType = 1)
	#print "frequency()\t\t\t", e.frequency("to")
	#print "getFrequencies()\t", len(freq)
	#print "getTotalCount()\t\t", e.getTotalCount()
	print "getNgramsCount()\t\t\t", e.getNgramsCount('a')
	print ng("this is a test", 3)
	#print "getNgramsFrequencies()\t", e.getNgramsFrequencies()
	#for w in e.getWordList():
	#	print e.frequency(w), '\t', w

Example #8
def _gen_ngrams(sent, n):
    # Join each n-gram tuple into one space-separated string.
    ngrams = [' '.join(ngram) for ngram in ng(sent, n)]
    return ngrams
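Assuming the ng helper sketched under Example #1:

_gen_ngrams('the quick brown fox'.split(), 2)
# -> ['the quick', 'quick brown', 'brown fox']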
Example #10
 def ngrams(self, n=5, v_idx=None):
     """
     Make a query for ngram frequency counter
     @takes:
         n :: N gram size (if n=5, [-2 -1 +1 +2])
         v_idx:: int, positional index of the checkpoint
     @returns:
         suf_ngram: {"suf_-2_the": 1, "suf_-1_cat": 1, ...}
         pos_ngram: {"suf_-2_DT": 1, "suf_-1_NN": 1, ...}
     """
      try:
          if not v_idx:
              # Fall back to the instance checkpoint index, then to 0.
              v_idx = self.v_idx or 0
          suf_ngram = {}
          pos_ngram = {}
          window = int((n - 1) / 2)
          core = self.WL[v_idx]  # word at the checkpoint (not used below)
          _lefts = [word for index, word in enumerate(self.SUF) if index < v_idx][-window:]
          _leftp = [word for index, word in enumerate(self.POS) if index < v_idx][-window:]
          _rights = [word for index, word in enumerate(self.SUF) if index > v_idx][:window]
          _rightp = [word for index, word in enumerate(self.POS) if index > v_idx][:window]
         concats = _lefts + ["*V*"] + _rights
         concatp = _leftp + ["*V*"] + _rightp
         suf_unigram = {SimpleFeatureExtractor.gen_fn(["SUF1G", str(i-window), "".join(w)]):1 
                     for i, w in enumerate(concats) if w != "*V*"}
         pos_unigram = {SimpleFeatureExtractor.gen_fn(["POS1G", str(i-window), "".join(w)]):1 
                     for i, w in enumerate(concatp) if w != "*V*"}
         suf_bigram = {SimpleFeatureExtractor.gen_fn(["SUF2G", "", "-".join(w)]):1 
                     for i, w in enumerate(ng(concats, 3)) if w[0] == "*V*" or w[2] == "*V*"} if n >= 5 else {}
         pos_bigram = {SimpleFeatureExtractor.gen_fn(["POS2G", "", "-".join(w)]):1 
                     for i, w in enumerate(ng(concatp, 3)) if w[0] == "*V*" or w[2] == "*V*"} if n >= 5 else {}
         suf_trigram = {SimpleFeatureExtractor.gen_fn(["SUF3G", "", "-".join(w)]):1 
                     for i, w in enumerate(ng(concats, 4)) if w[0] == "*V*" or w[3] == "*V*"} if n >= 7 else {}
         pos_trigram = {SimpleFeatureExtractor.gen_fn(["POS3G", "", "-".join(w)]):1 
                     for i, w in enumerate(ng(concatp, 4)) if w[0] == "*V*" or w[3] == "*V*"} if n >= 7 else {}
         suf_c3gram = {SimpleFeatureExtractor.gen_fn(["SUF3G", "", "-".join(w)]):1 
                     for i, w in enumerate(ng(concats, 3)) if w[1] == "*V*"} if n >= 3 else {}
         # suf_c5gram = {SimpleFeatureExtractor.gen_fn(["SUF5G", "", "-".join(w)]):1 
         #             for i, w in enumerate(ng(concats, 5)) if w[2] == "*V*"} if n >= 5 else {}
         # suf_c7gram = {SimpleFeatureExtractor.gen_fn(["SUF7G", "", "-".join(w)]):1 
                     # for i, w in enumerate(ng(concats, 7)) if w[3] == "*V*"} if n >= 7 else {}
         pos_c3gram = {SimpleFeatureExtractor.gen_fn(["POS3G", "", "-".join(w)]):1 
                     for i, w in enumerate(ng(concatp, 3)) if w[1] == "*V*"} if n >= 3 else {}
         # pos_c5gram = {SimpleFeatureExtractor.gen_fn(["POS5G", "", "-".join(w)]):1 
         #             for i, w in enumerate(ng(concatp, 5)) if w[2] == "*V*"} if n >= 5 else {}
         # pos_c7gram = {SimpleFeatureExtractor.gen_fn(["POS7G", "", "-".join(w)]):1 
                     # for i, w in enumerate(ng(concatp, 7)) if w[3] == "*V*"} if n >= 7 else {}
         self.features.update(suf_unigram)
         self.features.update(pos_unigram)
         self.features.update(suf_bigram)
         self.features.update(pos_bigram)
         self.features.update(suf_trigram)
         self.features.update(pos_trigram)
         self.features.update(suf_c3gram)
         # self.features.update(suf_c5gram)
         # self.features.update(suf_c7gram)
         self.features.update(pos_c3gram)
         # self.features.update(pos_c5gram)
         # self.features.update(pos_c7gram)
      except Exception:
          # Malformed inputs are silently ignored.
          pass
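To make the windowing concrete, a comment-only walk-through with invented data; the key format assumes gen_fn joins its arguments with underscores:

# n=5 -> window=2. With SUF = ['the', 'cat', 'eats', 'fresh', 'fish'] and
# v_idx = 2 (the checkpoint 'eats'):
#   _lefts  = ['the', 'cat'],  _rights = ['fresh', 'fish']
#   concats = ['the', 'cat', '*V*', 'fresh', 'fish']
# suf_unigram then becomes:
#   {'SUF1G_-2_the': 1, 'SUF1G_-1_cat': 1, 'SUF1G_1_fresh': 1, 'SUF1G_2_fish': 1}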
Example #11
def _gen_subsets(ngram: str):
    """Yield every (n-1)-gram contained in a space-separated n-gram string."""
    words = ngram.split(' ')
    n = len(words)
    for sub in ng(words, n - 1):
        yield ' '.join(sub)
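With the same assumed ng helper:

list(_gen_subsets('new york city'))
# -> ['new york', 'york city']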