示例#1
0
def sentence_to_word(sentence, lang):
    """Split ``sentence`` into a list of word tokens.

    When ``lang`` equals ``'en'`` the sentence is tokenized with
    ``nltk.word_tokenize`` and every token is lower-cased.  For any other
    ``lang`` value a new ``SinhalaTokenizer`` is created and the result of
    its ``tokenize`` method is returned unchanged.
    """
    # Anything that is not explicitly English goes to the Sinhala tokenizer.
    if lang != 'en':
        return SinhalaTokenizer().tokenize(sentence)

    english_tokens = nltk.word_tokenize(sentence)
    return [token.lower() for token in english_tokens]
示例#2
0
def get_sn_process_setup():
    """
    Build the helper objects used for Sinhala text processing.

    Returns a ``(tokenizer, stemmer)`` tuple: a new ``SinhalaTokenizer``
    and the object produced by ``sinhalaStemmer.stemmer()``.
    """
    # Tuple elements are evaluated left to right: tokenizer first, then stemmer.
    return SinhalaTokenizer(), sinhalaStemmer.stemmer()
class SyntaxAnalysis:
    """Tokenize, stem and part-of-speech tag a piece of text.

    The instance owns one ``SinhalaTokenizer``, one ``POSTagger`` and one
    ``Stemming`` object, all created in the constructor.
    """

    def __init__(self):
        self.__tokenizer = SinhalaTokenizer()
        self.__tagger = POSTagger()
        self.__stemming = Stemming()

    def pos_tagger(self, sentence):
        """Return the tagger's prediction for the first sentence of ``sentence``.

        The text is split into sentences, a ``'.'`` is appended to each one
        and every sentence is tokenized.  Only the tokens of the first
        sentence are reduced to their roots and handed to the tagger, so an
        input that yields no sentences raises ``IndexError``.
        """
        # Tokenize every sentence, each with a full stop appended.
        tokenized_sentences = [
            self.__tokenizer.tokenize(f'{part}.')
            for part in self.__tokenizer.split_sentences(sentence)
        ]

        # Stem the tokens of the first sentence only.
        first_sentence = tokenized_sentences[0]
        roots = [self.__stemming.find_root(word) for word in first_sentence]

        # The tagger receives a batch containing that single stemmed sentence.
        return self.__tagger.predict([roots])
from elasticsearch import Elasticsearch
from sinling import SinhalaTokenizer
from sinling import word_splitter

# Module-level Elasticsearch client and Sinhala tokenizer, created at import time.
es = Elasticsearch()
tokenizer = SinhalaTokenizer()
# Keyword lists ("boosters"), one per kind of song metadata.
# NOTE(review): presumably a search query containing one of these words gets
# the matching field boosted; the code that consumes the lists is not visible
# here — confirm against the query builder.

# Words associated with the performing artist.
artist_name_boosters = [
    'ගේ', 'කීව', 'කී', 'ගායනා කරන', 'ගයන', 'ගායනා', '‌ගේ', 'හඩින්', 'කියනා',
    'කිව්ව', 'කිව්', 'කිව', 'ගායනය', 'ගායනා කළා', 'ගායනා කල', 'ගැයූ'
]
# Words associated with the lyric writer.
# NOTE(review): 'ලියන' is listed twice; harmless for membership tests.
writer_name_boosters = [
    'ලියා', 'ලියූ', 'ලිව්ව', 'ලිව්', 'රචනා', 'ලියා ඇති', 'රචිත', 'ලියන ලද',
    'ලියන', 'හදපු', 'පද', 'රචනය', 'හැදූ', 'හැදුව', 'ලියන', 'ලියන්න', 'ලීව',
    'ලියපු', 'ලියා ඇත', 'ලිඛිත'
]
# Words associated with the music of a song.
music_boosters = ["වාදනය", "සංගීතය", "නාද"]
# Words associated with movies.
movie_boosters = ['චිත්‍රපට', 'සිනමා']
# Words associated with genre; 'චිත්‍රපට' also appears in movie_boosters.
genre_boosters = [
    'දේවානුභාවයෙ', 'පැරණි', 'පොප්ස්', 'පොප්', 'පරණ', 'ක්ලැසික්', 'ක්ලැසි',
    'ඉල්ලීම', 'චිත්‍රපට', 'නව', 'වර්ගයේ', 'අයත්', 'වර්ගයට', 'කණ්ඩායම්', 'යුගල'
]
# Words associated with view counts.
# NOTE(review): presumably popularity / "best of" terms — confirm with a Sinhala speaker.
views_boosters = [
    'සුපිරි', 'නියම', 'ප්‍රමුඛතම', 'පට්ට', 'ඉහළම', 'ගෝල්ඩන්', 'හොඳ', 'හොඳම',
    'එලකිරි', 'එළකිරි', 'සුප්පර්', 'සුප්රකට', 'ඉහල', 'වැඩිපුර', 'වැඩිපුරම',
    'සුප්‍රකට', 'ජනප්රිය', 'ජනප්රියම', 'ජනප්‍රිය', 'ජනප්‍රියම', 'ප්‍රකට',
    'ප්‍රසිද්ධ'
]
boosts_default = {
    "title_sinhala": 1,
    "artist_name": 1,
    "writer_name": 1,
示例#5
0
def get_sn_process_setup():
    """Return a ``(tokenizer, stemmer)`` pair for Sinhala text processing.

    The tokenizer is a new ``SinhalaTokenizer`` and the stemmer is whatever
    ``sinhalaStemmer.stemmer()`` returns; the tokenizer is created first.
    """
    return SinhalaTokenizer(), sinhalaStemmer.stemmer()
class QueryProcessor:
    """Turn user search input into Elasticsearch queries over a song index.

    A query string is tokenized, ASCII tokens are translated, every token is
    stemmed and expanded with likely misspellings, and the resulting terms
    are sent to Elasticsearch as a ``bool``/``should`` query.  Every search
    also requests the same set of facet aggregations.

    The stemming, autocorrect and classification steps read their rules from
    ``resource_files/*.txt`` relative to the current working directory.
    """

    def __init__(self):
        self.tokenizer = SinhalaTokenizer()
        self.es = Elasticsearch()
        self.index = "160376l-ssb-data-2020-modified-index7"
        # Cache of already translated words: source word -> translation.
        self.translation_dict = {}

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _aggregations(self):
        """Return a fresh copy of the facet aggregations sent with every search."""
        aggs = {}
        for label, field in (
            ("Artist", "artist"),
            ("Composer", "composer"),
            ("Genre", "genre"),
            ("Movie", "movie"),
            ("Writer", "writer"),
            ("Key", "key"),
            ("Beat", "beat"),
        ):
            aggs[label + " Filter"] = {
                "terms": {"field": field + ".keyword", "size": 10}
            }
        aggs["View Filter"] = {
            "range": {
                "field": "views",
                "ranges": [
                    {"from": 0, "to": 1000},
                    {"from": 1000, "to": 2000},
                    {"from": 2000, "to": 3000},
                    {"from": 3000},
                ],
            }
        }
        return aggs

    def _search(self, query, sort=None):
        """Run ``query`` against the index with the standard size and facets."""
        body = {"query": query, "size": 100, "aggs": self._aggregations()}
        if sort is not None:
            body["sort"] = sort
        return self.es.search(index=self.index, body=body)

    def _read_resource(self, path):
        """Return the text of ``path``, read as UTF-8 with a latin-1 fallback."""
        try:
            with io.open(path, "r", encoding='utf-8') as handle:
                return handle.read()
        except UnicodeDecodeError:
            with io.open(path, "r", encoding='latin-1') as handle:
                return handle.read()

    def _expand_terms(self, text):
        """Tokenize ``text`` and return the autocorrected variants of each stem."""
        tokens = self.tokenizer.tokenize(text)
        tokens.extend(self.translate_array(tokens))
        return self.autocorrect(self.stemming(tokens))

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------
    def translate_word(self, word):
        """Translate a single word by calling ``translate(word, 'si', 'en')``.

        NOTE(review): presumably English -> Sinhala; confirm the argument
        order of the ``translate`` helper.
        """
        return translate(word, 'si', 'en')

    def translate_array(self, wordlist):
        """Return translations of the pure-ASCII words in ``wordlist``.

        Non-ASCII words are skipped.  Translations are cached in
        ``self.translation_dict`` so each distinct word is translated once.
        """
        translated_array = []
        for word in wordlist:
            # A string is pure ASCII when UTF-8 encoding does not lengthen it.
            if len(word) != len(word.encode()):
                continue
            if word not in self.translation_dict:
                self.translation_dict[word] = self.translate_word(word)
            translated_array.append(self.translation_dict[word])
        return translated_array

    # ------------------------------------------------------------------
    # Query generation
    # ------------------------------------------------------------------
    def advancedQuery(self, queryDictionary):
        """Run a per-field query.

        ``queryDictionary`` maps an index field name to the text entered for
        that field; ``None`` and empty values are ignored.  The dictionary
        itself is left unmodified.  Returns the Elasticsearch response.
        """
        multTermValue = []
        for field in queryDictionary:
            raw_value = queryDictionary[field]
            if raw_value is None or raw_value == "":
                continue
            value = raw_value.replace(".", " ")
            terms = [
                item
                for sublist in self._expand_terms(value)
                for item in sublist
            ]
            # The cleaned text itself is also accepted as a term.
            terms.append(value)
            multTermValue.append({"terms": {field: terms, "boost": 2}})
        return self._search({"bool": {"should": multTermValue}})

    def generateTermsMultipleQuery(self, flat_list_act, fields, classDict, searchQuery):
        """Run a query in which the metadata fields named in ``fields`` are boosted.

        ``classDict`` maps each field to the number of boosting keywords found
        for it; the boost used is that count plus one.  The pseudo-field
        ``"popularity"`` does not add a clause; instead it makes the view
        count influence the ranking:

        * no ``"popularity"``: plain ``bool`` query, title and lyrics included;
        * ``"popularity"`` plus other fields: ``function_score`` on ``views``;
        * only ``"popularity"``: every document, sorted by ``views`` descending.
        """
        rank_by_views = "popularity" in fields
        if rank_by_views and len(fields) == 1:
            return self._search(
                {"match_all": {}},
                sort=[{"views": {"order": "desc"}}],
            )

        multTermValue = []
        addedFields = []
        for field in fields:
            if field == "popularity":
                continue
            multTermValue.append(
                {"terms": {field: flat_list_act, "boost": classDict[field] + 1}}
            )
            addedFields.append(field)
        # Metadata fields that were not explicitly requested get a neutral boost.
        for field in ["writer", "composer", "artist", "genre", "key", "beat", "movie"]:
            if field not in addedFields:
                multTermValue.append({"terms": {field: flat_list_act, "boost": 1}})

        if not rank_by_views:
            for field in ["title", "songLyricsSearchable"]:
                if field not in addedFields:
                    multTermValue.append({"terms": {field: flat_list_act}})
                    multTermValue.append({"match_phrase": {field: searchQuery}})
            return self._search({"bool": {"should": multTermValue}})

        # Ranking terms were given: scale the text score by the view count.
        return self._search({
            "function_score": {
                "functions": [
                    {
                        "field_value_factor": {
                            "field": "views",
                            "factor": 1.0001,
                            "missing": 1,
                        }
                    }
                ],
                "query": {"bool": {"should": multTermValue}},
                "score_mode": "multiply",
            }
        })

    def generateNormalQuery(self, flat_list_act, searchQuery):
        """Run an unboosted query over every searchable field."""
        print("[INFO] Generating Normal Query")
        multTermValue = []
        for field in ["artist", "writer", "genre", "composer", "title",
                      "songLyricsSearchable", "movie", "beat", "key"]:
            multTermValue.append({"terms": {field: flat_list_act, "boost": 1}})
            multTermValue.append({"match_phrase": {field: searchQuery}})
        print(multTermValue)
        return self._search({"bool": {"should": multTermValue}})

    def generateQuery(self, searchQuery):
        """Analyse a free-text query, pick a query type and return the ES response.

        When the query contains boosting keywords for a rankable field a
        boosted query is issued, otherwise a normal query.  A normal query is
        also used when keywords were found but none of them belongs to a
        rankable field (previously this left the result undefined).
        """
        print("[INFO] Generating Query")
        searchQuery = searchQuery.replace(".", " ")
        act = self._expand_terms(searchQuery)
        flat_list_act = [item for sublist in act for item in sublist]
        classDict = self.searchClassification(act)

        rankedlist = []
        for name in classDict:
            if name not in ["writer", "composer", "artist", "genre",
                            "popularity", "key", "beat", "movie"]:
                continue
            rankedlist.append(name)
            if name == "key":
                # Musical key such as "C# minor"; added only when one is present.
                found = re.search(
                    r"[A-Ga-g][b#]? (major|minor|Major|Minor)", searchQuery
                )
                if found is not None:
                    flat_list_act.append(found.group(0))
            if name == "beat":
                # Time signature such as "3/4"; added only when one is present.
                beats = re.findall(r"\b[0-9]{1,2}\/[0-9]{1,2}", searchQuery)
                if beats:
                    flat_list_act.append(beats[0])

        if rankedlist:
            return self.generateTermsMultipleQuery(
                flat_list_act, rankedlist, classDict, searchQuery
            )
        return self.generateNormalQuery(flat_list_act, searchQuery)

    # ------------------------------------------------------------------
    # Text analysis
    # ------------------------------------------------------------------
    def getSubsets(self, iterable):
        """Return an iterator over every subset of ``iterable`` as tuples.

        ``iterable`` must support ``len``.  The number of subsets doubles
        with each additional element.
        """
        return chain.from_iterable(combinations(iterable, r) for r in range(len(iterable) + 1))

    def searchClassification(self, tokens):
        """Count the boosting keywords in ``tokens`` per category.

        ``tokens`` is a list of lists as returned by ``autocorrect``: one
        inner list of spelling variants per token.  Categories and their
        keywords come from ``resource_files/synonyms.txt`` (one
        ``category:word1,word2`` entry per line; lines without ``:`` are
        ignored).  For each token the variants are tried in order and the
        search stops at the first variant that is a keyword, so a token is
        counted at most once per category.

        Returns a dict mapping category name to its count.
        """
        synonymsDict = {}
        for line in self._read_resource("resource_files/synonyms.txt").split("\n"):
            parts = line.split(":")
            if len(parts) < 2:
                continue
            synonymsDict[parts[0]] = parts[1].split(",")

        rankedQuery = {}
        for corrected_tokens in tokens:
            for token in corrected_tokens:
                foundsynonym = False
                for key in synonymsDict:
                    if token in synonymsDict[key]:
                        rankedQuery[key] = rankedQuery.get(key, 0) + 1
                        foundsynonym = True
                if foundsynonym:
                    break
        return rankedQuery

    def autocorrect(self, tokens):
        """Expand every token with variants built from common misspelling pairs.

        ``resource_files/mispellings.txt`` holds whitespace-separated
        ``a,b`` pairs; entries without a comma are ignored.  For each token,
        every subset of the pairs that occur in it is applied, swapping one
        spelling for the other.  The unmodified token is always the first
        variant.

        Returns a list with one list of variants per input token.
        """
        entries = self._read_resource("resource_files/mispellings.txt").split()
        missListSet = [pair for pair in (entry.split(',') for entry in entries)
                       if len(pair) >= 2]

        allWords = [[] for _ in tokens]
        for token_number, token in enumerate(tokens):
            missListForWord = [
                pair for pair in missListSet
                if pair[0] in token or pair[1] in token
            ]
            subsets = list(self.getSubsets(missListForWord))
            print(subsets)
            for subset in subsets:
                variant = token
                for pair in subset:
                    if pair[0] in variant:
                        variant = variant.replace(pair[0], pair[1])
                    # NOTE(review): this tests the original token rather than
                    # the partially rewritten variant — confirm it is intended.
                    elif pair[1] in token:
                        variant = variant.replace(pair[1], pair[0])
                allWords[token_number].append(variant)
        return allWords

    def rreplace(self, s, old, new, occurrence):
        """Replace the last ``occurrence`` occurrences of ``old`` in ``s`` with ``new``."""
        return new.join(s.rsplit(old, occurrence))

    def stemming(self, doc):
        """Return the sorted words of ``doc`` followed by their stemmed forms.

        Suffixes are read from ``resource_files/suffixes.txt``
        (whitespace-separated).  For every word and every suffix the word
        ends with, the word with that suffix removed is appended.  ``doc``
        itself is not modified.
        """
        suffixList = self._read_resource("resource_files/suffixes.txt").split()

        # sorted() keeps the caller's list in its original order.
        ordered = sorted(doc)
        stemmedWordlist = list(ordered)
        for word in ordered:
            for suffix in suffixList:
                if word.endswith(suffix):
                    stemmedWordlist.append(self.rreplace(word, suffix, "", 1))
        return stemmedWordlist
 def __init__(self):
     """Create the tokenizer and Elasticsearch client and set the index name.

     ``translation_dict`` starts out as an empty dictionary.
     """
     self.tokenizer = SinhalaTokenizer()
     self.es = Elasticsearch()
     # Plain values; the order of these two assignments is irrelevant.
     self.translation_dict = {}
     self.index = "160376l-ssb-data-2020-modified-index7"
示例#8
0
class QueryProcessor:
    """Build and run Elasticsearch queries for a song index.

    Queries are tokenized, extended with translations of their ASCII words
    and with suffix-stripped variants, then sent to the index named by
    ``self.index``. Three plain-text resource files are read from the
    current working directory: ``synonyms.txt``, ``mispellings.txt`` and
    ``suffixes.txt``.
    """

    # Song metadata fields matched with wildcard clauses, in clause order.
    _METADATA_FIELDS = ("writer", "composer", "artist", "genre", "key",
                        "beat", "movie")
    # Fields searched by a plain (unclassified) query, in clause order.
    _NORMAL_QUERY_FIELDS = ("artist", "writer", "genre", "composer", "title",
                            "songLyricsSearchable", "movie", "beat", "key")
    # Classification labels that generateQuery turns into ranked fields.
    _RANKABLE_CLASSES = frozenset(_METADATA_FIELDS + ("popularity",))
    # Musical key such as "C# minor"; no stray commas in the character
    # classes and both capitalised spellings are accepted.
    _KEY_PATTERN = r"[A-Ga-g][b#]? (major|minor|Major|Minor)"
    # Time signature such as "3/4".
    _BEAT_PATTERN = r"\b[0-9]{1,2}\/[0-9]{1,2}"

    def __init__(self):
        self.tokenizer = SinhalaTokenizer()
        self.es = Elasticsearch()
        self.index = "160376l-ssb-data-2020-modified-index4"
        # Memo of words already passed through translate_word.
        self.translation_dict = {}

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _read_text(path):
        """Return the content of ``path`` (UTF-8, falling back to Latin-1)."""
        try:
            with io.open(path, "r", encoding='utf-8') as handle:
                return handle.read()
        except UnicodeDecodeError:
            with io.open(path, "r", encoding='latin-1') as handle:
                return handle.read()

    @staticmethod
    def _filter_aggregations():
        """Return a fresh copy of the facet aggregations sent with searches."""
        aggs = {
            label + " Filter": {
                "terms": {"field": label.lower() + ".keyword", "size": 10}
            }
            for label in ("Artist", "Composer", "Genre", "Movie", "Writer",
                          "Key", "Beat")
        }
        aggs["View Filter"] = {
            "range": {
                "field": "views",
                "ranges": [
                    {"from": 0, "to": 1000},
                    {"from": 1000, "to": 2000},
                    {"from": 2000, "to": 3000},
                    {"from": 3000},
                ],
            }
        }
        return aggs

    @staticmethod
    def _wildcard_clauses(field, terms):
        """Return one ``*term*`` wildcard clause on ``field`` per term."""
        return [{"wildcard": {field: "*" + term + "*"}} for term in terms]

    def _search(self, should_clauses, sort=None):
        """Run a ``bool``/``should`` search (100 hits) with the facet aggs."""
        body = {"query": {"bool": {"should": should_clauses}}}
        if sort is not None:
            body["sort"] = sort
        body["size"] = 100
        body["aggs"] = self._filter_aggregations()
        return self.es.search(index=self.index, body=body)

    # ------------------------------------------------------------------
    # Translation
    # ------------------------------------------------------------------
    def translate_word(self, word):
        """Return ``translate(word, 'si', 'en')``.

        NOTE(review): ``translate`` is defined outside this class; which of
        the two language codes is the source and which the target depends on
        that helper's signature - confirm.
        """
        return translate(word, 'si', 'en')

    def translate_array(self, wordlist):
        """Translate the pure-ASCII words of ``wordlist``.

        Words containing non-ASCII characters are skipped (nothing is
        appended for them). Results are memoised in ``self.translation_dict``.

        Returns:
            List of translations, one per ASCII word, in input order.
        """
        translated_array = []
        for word in wordlist:
            # UTF-8 uses exactly one byte per character only for ASCII text.
            if len(word) != len(word.encode()):
                continue
            if word not in self.translation_dict:
                self.translation_dict[word] = self.translate_word(word)
            translated_array.append(self.translation_dict[word])
        return translated_array

    # ------------------------------------------------------------------
    # Query builders
    # ------------------------------------------------------------------
    def advancedQuery(self, queryDictionary):
        """Search with one boosted ``terms`` clause per non-empty field.

        Args:
            queryDictionary: Mapping of field name to the text searched in
                that field; ``None`` and ``""`` values are ignored.

        Returns:
            The raw Elasticsearch response.
        """
        multTermValue = []
        for field in queryDictionary:
            value = queryDictionary[field]
            if value is None or value == "":
                continue
            tokens = self.tokenizer.tokenize(value)
            tokens.extend(self.translate_array(tokens))
            variants = self.autocorrect(self.stemming(tokens))
            terms = [word for group in variants for word in group]
            # The untouched input is searched as well.
            terms.append(value)
            multTermValue.append({"terms": {field: terms, "boost": 2}})
        print(multTermValue)
        return self._search(multTermValue)

    def generateMLTQuery(self, searchQuery, rankedlist):
        """Run a ``more_like_this`` query on ``rankedlist`` fields.

        Returns:
            The list of hits (``res['hits']['hits']``).
        """
        print("[INFO] Generating ranked Query")
        res = self.es.search(index=self.index,
                             body={
                                 "query": {
                                     "more_like_this": {
                                         "fields": rankedlist,
                                         "like": searchQuery,
                                         "min_term_freq": 1,
                                         "max_query_terms": 12
                                     }
                                 }
                             })
        return res['hits']['hits']

    def generateTermsMultipleQuery(self, flat_list_act, fields, classDict,
                                   searchQuery=None):
        """Search with boosted ``terms`` clauses on the classified fields.

        Args:
            flat_list_act: Search terms (strings).
            fields: Classified field names; the pseudo-field ``"popularity"``
                requests sorting by ``views`` instead of adding a clause.
            classDict: Mapping of field name to its classification count;
                the boost of a field is ``classDict[field] + 1``.
            searchQuery: Optional raw query text. When given (and the result
                is not popularity-sorted) it is added as ``match_phrase``
                clauses on the title and lyrics fields. Previously the method
                read an undefined ``searchQuery`` name at this point.

        Returns:
            The raw Elasticsearch response.
        """
        multTermValue = []
        sort_by_views = False
        addedFields = []
        for field in fields:
            if field != "popularity":
                multTermValue.append({
                    "terms": {
                        field: flat_list_act,
                        "boost": classDict[field] + 1
                    }
                })
                addedFields.append(field)
                continue
            sort_by_views = True
            if len(fields) == 1:
                # NOTE(review): together with the loop below this adds every
                # wildcard clause twice for a popularity-only query; kept
                # as-is so the returned scores do not change.
                for meta in self._METADATA_FIELDS:
                    multTermValue.extend(
                        self._wildcard_clauses(meta, flat_list_act))
        # Metadata fields without a terms clause get loose wildcard matches.
        for meta in self._METADATA_FIELDS:
            if meta not in addedFields:
                multTermValue.extend(
                    self._wildcard_clauses(meta, flat_list_act))
        if not sort_by_views:
            for field in ("title", "songLyricsSearchable"):
                multTermValue.append({"terms": {field: flat_list_act}})
                if searchQuery is not None:
                    multTermValue.append(
                        {"match_phrase": {field: searchQuery}})
        print(multTermValue)
        if sort_by_views:
            print("run query")
            return self._search(
                multTermValue,
                sort=["_score", {"views": {"order": "desc"}}])
        return self._search(multTermValue)

    def generateTermsSingleQuery(self, flat_list_act, field):
        """Run a single ``terms`` query on ``field`` and return the hits."""
        res = self.es.search(index=self.index,
                             body={"query": {
                                 "terms": {
                                     field: flat_list_act
                                 }
                             }})
        return res['hits']['hits']

    def generateNormalQuery(self, flat_list_act, searchQuery):
        """Search every field with wildcard and ``match_phrase`` clauses.

        Returns:
            The raw Elasticsearch response.
        """
        print("[INFO] Generating Normal Query")
        multTermValue = []
        for field in self._NORMAL_QUERY_FIELDS:
            multTermValue.extend(self._wildcard_clauses(field, flat_list_act))
            multTermValue.append({"match_phrase": {field: searchQuery}})
        print(multTermValue)
        return self._search(multTermValue)

    def generateQuery(self, searchQuery):
        """Classify ``searchQuery`` and run the matching kind of search.

        A query whose tokens hit no rankable classification label is run as
        a normal query; otherwise a boosted multi-field terms query is used.

        Returns:
            The raw Elasticsearch response.
        """
        print("[INFO] Generating Query")
        tokens = self.tokenizer.tokenize(searchQuery)
        tokens.extend(self.translate_array(tokens))
        stemmed_tokens = self.stemming(tokens)
        # Autocorrection is not applied here: each stem forms its own
        # single-candidate group for the classifier.
        act = [[stem] for stem in stemmed_tokens]
        flat_list_act = list(stemmed_tokens)
        classDict = self.searchClassification(act)

        rankedlist = []
        for label in classDict:
            if label not in self._RANKABLE_CLASSES:
                continue
            rankedlist.append(label)
            if label == "key":
                # Only add the key when the query really spells one out;
                # indexing a failed match used to raise TypeError.
                match = re.search(self._KEY_PATTERN, searchQuery)
                if match:
                    flat_list_act.append(match.group(0))
            elif label == "beat":
                found = re.findall(self._BEAT_PATTERN, searchQuery)
                if found:
                    flat_list_act.append(found[0])

        if not rankedlist:
            # Nothing rankable was recognised (this also covers labels outside
            # the rankable set, which used to leave the result unbound).
            return self.generateNormalQuery(flat_list_act, searchQuery)
        return self.generateTermsMultipleQuery(
            flat_list_act, rankedlist, classDict, searchQuery=searchQuery)

    # ------------------------------------------------------------------
    # Token utilities
    # ------------------------------------------------------------------
    def getSubsets(self, iterable):
        """Return an iterator over all subsets of ``iterable`` as tuples.

        ``iterable`` must support ``len``. Subsets come in order of
        increasing size, starting with the empty tuple.
        """
        return chain.from_iterable(
            combinations(iterable, r) for r in range(len(iterable) + 1))

    # Observe tokens and return query type as 'Normal Lyric Search', 'Feature Search', 'Ranked Feature Search'
    def searchClassification(self, tokens):
        """Count, per classification label, the tokens that are its synonyms.

        ``synonyms.txt`` holds one ``label:word1,word2,...`` entry per line;
        lines without a ``:`` are ignored.

        Args:
            tokens: List of candidate groups (lists of strings). Within a
                group, scanning stops at the first candidate that matched
                any label.

        Returns:
            Dict mapping label to its number of matches; empty when nothing
            matched.
        """
        synonymsDict = {}
        for line in self._read_text("synonyms.txt").split("\n"):
            parts = line.split(":")
            if len(parts) < 2:
                continue
            synonymsDict[parts[0]] = parts[1].split(",")

        rankedQuery = {}
        for corrected_tokens in tokens:
            for token in corrected_tokens:
                matched = [
                    label for label in synonymsDict
                    if token in synonymsDict[label]
                ]
                for label in matched:
                    rankedQuery[label] = rankedQuery.get(label, 0) + 1
                if matched:
                    break
        return rankedQuery

    # Looks at basic error rules within the Sinhala language and appends likely errors
    def autocorrect(self, tokens):
        """Return, for every token, its likely spelling variants.

        ``mispellings.txt`` holds whitespace separated ``a,b`` pairs of
        confusable substrings. For each token, every subset of the pairs
        occurring in it is applied to a fresh copy of the token, swapping
        ``a`` for ``b`` (or ``b`` for ``a``). The empty subset comes first,
        so the first variant is the unchanged token.

        NOTE: the number of variants doubles with every applicable pair.

        Returns:
            List parallel to ``tokens``; each element is the list of
            variants of the corresponding token.
        """
        pairs = [
            entry.split(',')
            for entry in self._read_text("mispellings.txt").split()
        ]
        # Entries without two non-empty sides cannot be swapped.
        pairs = [p for p in pairs if len(p) >= 2 and p[0] and p[1]]

        allWords = []
        for token in tokens:
            relevant = [p for p in pairs if p[0] in token or p[1] in token]
            variants = []
            for subset in self.getSubsets(relevant):
                # Restart from the original token so that one subset's swaps
                # do not leak into the next subset's variant.
                variant = token
                for pair in subset:
                    if pair[0] in variant:
                        variant = variant.replace(pair[0], pair[1])
                    elif pair[1] in variant:
                        variant = variant.replace(pair[1], pair[0])
                variants.append(variant)
            allWords.append(variants)
        return allWords

    def rreplace(self, s, old, new, occurrence):
        """Replace up to ``occurrence`` right-most ``old`` in ``s`` by ``new``."""
        return new.join(s.rsplit(old, occurrence))

    # Reduce strings to simple formats based on rules
    def stemming(self, doc):
        """Extend a word list with suffix-stripped variants of its words.

        Suffixes are read from ``suffixes.txt`` (whitespace separated).

        Args:
            doc: List of words. NOTE: it is sorted in place, so the caller's
                list is reordered as a side effect.

        Returns:
            A new list with the sorted words followed by one stripped variant
            per (word, suffix) pair where the word ends with the suffix.
        """
        suffixList = self._read_text("suffixes.txt").split()

        doc.sort()
        stemmedWordlist = list(doc)
        for word in doc:
            for suffix in suffixList:
                if word.endswith(suffix):
                    stemmedWordlist.append(
                        self.rreplace(word, suffix, "", 1))
        return stemmedWordlist
示例#9
0
            features['BOS'] = True
        if i < len(sent) - 1:
            word_next = sent[i + 1]
            features.update({
                f'+1:word': word_next,
                '+1:word.isdigit()': word_next.isdigit(),
            })
        else:
            features['EOS'] = True
        return features


if __name__ == '__main__':
    from sinling import SinhalaTokenizer

    tokenizer = SinhalaTokenizer()

    document = 'මනුෂ්‍යයා අවුරුදු ලක්ෂ ගණනක සිට වෛරස් වසංගත නිසා එළිපිටම පීඩා විඳි සත්වයෙකි. ' \
               'ඇතැම් වෛරස් රෝග වලට වැක්සීන හෙවත් එන්නත් ද වෛරස් නාශක ඖෂධ ද තිබුනද සියලූ‍ වෛරස් ' \
               'සම්බන්ධයෙන් ඒ න්‍යාය වැඩ කරන්නේ නැත. වසූරිය වෛරසය මිනිසා විසින් මිහිමතින් තුරන් කර තිබේ.'

    tokenized_sentences = [
        tokenizer.tokenize(f'{ss}.')
        for ss in tokenizer.split_sentences(document)
    ]

    tagger = POSTagger()

    pos_tags = tagger.predict(tokenized_sentences)

    for sent in pos_tags:
示例#10
0
from sinling import SinhalaTokenizer

# Tokenize a sample string and print the resulting token list.
tokenizer = SinhalaTokenizer()

sentence = 'එච්.ආර්.ජෝතිපාල'

s = tokenizer.tokenize(sentence)
print(s)

from SinhalaStemming import sinhalaStemmer

# Build a stemmer object and print it.
testx = sinhalaStemmer.stemmer()
print(testx)
 def __init__(self):
     """Create the tokenizer, POS tagger and stemming objects kept on the instance."""
     self.__tokenizer = SinhalaTokenizer()
     self.__tagger = POSTagger()
     self.__stemming = Stemming()
示例#12
0
def _search_match_clause(field, text, fuzziness='AUTO'):
    """Build a fuzzy ``match`` clause that requires every word of *text* in *field*."""
    return {
        'match': {
            field: {
                'query': text,
                'operator': 'and',
                'fuzziness': fuzziness
            }
        }
    }


def _search_scoped_query(field, text, extra_fields, extra_operator,
                         fuzziness='AUTO'):
    """Build a ``bool`` query: *field* must match, *extra_fields* go in ``should``."""
    return {
        'bool': {
            'must': _search_match_clause(field, text, fuzziness),
            'should': {
                'multi_match': {
                    'query': text,
                    'fields': extra_fields,
                    'type': 'best_fields',
                    'operator': extra_operator
                }
            }
        }
    }


def _search_rating_query(field):
    """Build a ``bool`` query selecting documents whose *field* is >= 0."""
    return {
        'bool': {
            'must': {
                'range': {
                    field: {
                        'gte': 0
                    }
                }
            },
        }
    }


def _search_strip_prefix(terms, keyword):
    """Remove *keyword* and ':' from *terms* (in place) and join the rest with spaces."""
    terms.remove(keyword)
    terms.remove(':')
    return " ".join(terms)


def _search_run(client, index, query, count, sort_by=None):
    """Execute *query* on *index* and wrap the first *count* hits as SearchResult."""
    request = Search(using=client, index=index).query(query)[:count]
    if sort_by is not None:
        request = request.sort(sort_by)
    return [SearchResult.from_doc(d) for d in request.execute()]


def search(term: str, count: int) -> List[SearchResult]:
    """Run a song search and return at most ``count`` results.

    The raw ``term`` is tokenized with ``SinhalaTokenizer`` and dispatched on a
    small query mini-language (checked in this order):

    * ``term == "songs"`` or no tokens: ``match_all`` on ``INDEX_NAME``.
    * ``term`` contains ``top`` and ``songs``: documents of the ``"tokenized"``
      index sorted by descending ``track_rating.sort``.
    * ``term`` contains ``top`` and ``artist``: same index, sorted by descending
      ``artist_rating.sort``.
    * tokens contain ``artist`` and ``:``: the remaining words must match
      ``artist_name`` (``"tokenized"`` index).
    * tokens contain ``lyrics`` and ``:``: the remaining words must match
      ``lyrics`` (``INDEX_NAME``).
    * tokens contain ``album`` and ``:``: the remaining words must match
      ``album_name`` (``INDEX_NAME``).
    * anything else: ``dis_max`` over fuzzy matches on ``title`` and
      ``artist_name`` (``INDEX_NAME``); lyrics are not searched on this path.

    Note that the ``top`` checks are substring tests on the raw ``term`` while
    the ``artist``/``lyrics``/``album`` checks are membership tests on the
    token list.

    Args:
        term: the user's query string.
        count: maximum number of hits to return.

    Returns:
        A list of ``SearchResult`` objects built with ``SearchResult.from_doc``.
    """
    client = Elasticsearch()

    # Elasticsearch 6 requires the content-type header to be set, and this is
    # not included by default in the current version of elasticsearch-py
    client.transport.connection_pool.connection.headers.update(HEADERS)

    terms = SinhalaTokenizer().tokenize(term)

    print(terms)

    if term == "songs" or terms == []:
        match_everything = {"bool": {"must": [{"match_all": {}}]}}
        return _search_run(client, INDEX_NAME, match_everything, count)

    if 'top' in term and ('songs' in term or 'artist' in term):
        if 'songs' in term:
            return _search_run(client, "tokenized",
                               _search_rating_query('track_rating.sort'),
                               count, sort_by='-track_rating.sort')
        # Reaching this point means 'artist' is in term, so no further test
        # is needed before running the artist ranking.
        return _search_run(client, "tokenized",
                           _search_rating_query('artist_rating'),
                           count, sort_by='-artist_rating.sort')

    if 'artist' in terms and ':' in terms:
        term = _search_strip_prefix(terms, 'artist')
        print('artist got here ' + term)
        query = _search_scoped_query('artist_name', term,
                                     ['title^2', 'lyrics'], 'or')
        return _search_run(client, "tokenized", query, count)

    if 'lyrics' in terms and ':' in terms:
        term = _search_strip_prefix(terms, 'lyrics')
        print('lyrics got here ' + term)
        query = _search_scoped_query('lyrics', term,
                                     ['title^3', 'artist_name'], 'and',
                                     fuzziness='2')
        return _search_run(client, INDEX_NAME, query, count)

    if 'album' in terms and ':' in terms:
        term = _search_strip_prefix(terms, 'album')
        print('albuns got here ' + term)
        query = _search_scoped_query('album_name', term,
                                     ['title^3', 'artist_name'], 'and')
        return _search_run(client, INDEX_NAME, query, count)

    term = " ".join(terms)
    print('else got here ' + term)
    dis_max_query = {
        'dis_max': {
            'queries': [
                _search_match_clause('title', term),
                _search_match_clause('artist_name', term),
            ],
            # tie_breaker is an option of dis_max itself. A query dict must
            # have exactly one top-level key, so it cannot sit next to
            # 'dis_max'.
            'tie_breaker': 0.5
        }
    }
    return _search_run(client, INDEX_NAME, dis_max_query, count)