Exemplo n.º 1
0
def sentence_to_word(sentence, lang):
    """Split *sentence* into a list of word tokens.

    When ``lang`` is ``'en'`` the sentence is tokenized with
    ``nltk.word_tokenize`` and every token is lower-cased.  Any other
    value of ``lang`` sends the sentence through a freshly created
    ``SinhalaTokenizer`` and returns its tokens unchanged.
    """
    if lang != 'en':
        return SinhalaTokenizer().tokenize(sentence)

    return [token.lower() for token in nltk.word_tokenize(sentence)]
Exemplo n.º 2
0
def get_sn_process_setup():
    """Build the Sinhala text-processing helpers.

    Returns:
        A ``(tokenizer, stemmer)`` tuple: a new ``SinhalaTokenizer``
        followed by the object returned by ``sinhalaStemmer.stemmer()``.
    """
    # Tuple elements are evaluated left to right, so the tokenizer is still
    # created before the stemmer.
    return SinhalaTokenizer(), sinhalaStemmer.stemmer()
from elasticsearch import Elasticsearch
from sinling import SinhalaTokenizer
from sinling import word_splitter

# Module-level Elasticsearch client created with the library's default
# connection settings, and a shared Sinhala tokenizer instance.
es = Elasticsearch()
tokenizer = SinhalaTokenizer()

# Sinhala keyword lists, one per searchable facet.
# NOTE(review): the code that consumes these lists is not visible here;
# presumably a keyword found in the user's query raises the boost of the
# matching field -- confirm against the query-building code.

# Keywords associated with the artist (singer) name.
artist_name_boosters = [
    'ගේ', 'කීව', 'කී', 'ගායනා කරන', 'ගයන', 'ගායනා', '‌ගේ', 'හඩින්', 'කියනා',
    'කිව්ව', 'කිව්', 'කිව', 'ගායනය', 'ගායනා කළා', 'ගායනා කල', 'ගැයූ'
]
# Keywords associated with the writer (lyricist) name.
# NOTE(review): 'ලියන' is listed twice.
writer_name_boosters = [
    'ලියා', 'ලියූ', 'ලිව්ව', 'ලිව්', 'රචනා', 'ලියා ඇති', 'රචිත', 'ලියන ලද',
    'ලියන', 'හදපු', 'පද', 'රචනය', 'හැදූ', 'හැදුව', 'ලියන', 'ලියන්න', 'ලීව',
    'ලියපු', 'ලියා ඇත', 'ලිඛිත'
]
# Keywords associated with music.
music_boosters = ["වාදනය", "සංගීතය", "නාද"]
# Keywords associated with movies.
movie_boosters = ['චිත්‍රපට', 'සිනමා']
# Keywords associated with genre.
genre_boosters = [
    'දේවානුභාවයෙ', 'පැරණි', 'පොප්ස්', 'පොප්', 'පරණ', 'ක්ලැසික්', 'ක්ලැසි',
    'ඉල්ලීම', 'චිත්‍රපට', 'නව', 'වර්ගයේ', 'අයත්', 'වර්ගයට', 'කණ්ඩායම්', 'යුගල'
]
# Keywords associated with view counts; several entries are spelling
# variants of the same word.
views_boosters = [
    'සුපිරි', 'නියම', 'ප්‍රමුඛතම', 'පට්ට', 'ඉහළම', 'ගෝල්ඩන්', 'හොඳ', 'හොඳම',
    'එලකිරි', 'එළකිරි', 'සුප්පර්', 'සුප්රකට', 'ඉහල', 'වැඩිපුර', 'වැඩිපුරම',
    'සුප්‍රකට', 'ජනප්රිය', 'ජනප්රියම', 'ජනප්‍රිය', 'ජනප්‍රියම', 'ප්‍රකට',
    'ප්‍රසිද්ධ'
]
boosts_default = {
    "title_sinhala": 1,
    "artist_name": 1,
    "writer_name": 1,
Exemplo n.º 4
0
def get_sn_process_setup():
    """Return a ``(tokenizer, stemmer)`` pair for Sinhala text processing.

    The tokenizer is a new ``SinhalaTokenizer``; the stemmer is whatever
    ``sinhalaStemmer.stemmer()`` returns.
    """
    # Left-to-right evaluation keeps the tokenizer-then-stemmer creation order.
    return SinhalaTokenizer(), sinhalaStemmer.stemmer()
 def __init__(self, index="160376l-ssb-data-2020-modified-index7", es=None):
     """Set up the tokenizer, Elasticsearch client and index name.

     Args:
         index: Name of the Elasticsearch index stored on ``self.index``.
             Defaults to the previously hard-coded index name, so
             existing ``ClassName()`` calls behave exactly as before.
         es: Optional pre-built Elasticsearch client.  When omitted, a
             client with the library's default connection settings is
             created, as before.
     """
     self.tokenizer = SinhalaTokenizer()
     # Accepting an injected client lets callers target a non-default
     # cluster without editing this class.
     self.es = Elasticsearch() if es is None else es
     self.index = index
     # Starts empty on every new instance.
     self.translation_dict = {}
Exemplo n.º 6
0
 def __init__(self):
     """Instantiate and store a SinhalaTokenizer, a POSTagger and a Stemming object."""
     # Double-underscore attribute names are name-mangled by Python, so these
     # are only reachable under these names from inside the enclosing class.
     self.__tokenizer = SinhalaTokenizer()
     self.__tagger = POSTagger()
     self.__stemming = Stemming()
Exemplo n.º 7
0
def _run_query(client, index, query, count, sort=None) -> List[SearchResult]:
    """Run *query* on *index* and wrap the first *count* hits as SearchResult."""
    request = Search(using=client, index=index).query(query)[:count]
    if sort is not None:
        request = request.sort(sort)
    return [SearchResult.from_doc(doc) for doc in request.execute()]


def _fuzzy_match(field, text, fuzziness='AUTO'):
    """Build a ``match`` clause requiring every word of *text* in *field*."""
    return {
        'match': {
            field: {
                'query': text,
                'operator': 'and',
                'fuzziness': fuzziness
            }
        }
    }


def _scoped_query(field, text, fuzziness, should_fields, should_operator):
    """Build a bool query: *field* must match; *should_fields* only add score."""
    return {
        'bool': {
            'must': _fuzzy_match(field, text, fuzziness),
            'should': {
                'multi_match': {
                    'query': text,
                    'fields': should_fields,
                    'type': 'best_fields',
                    'operator': should_operator
                }
            }
        }
    }


def search(term: str, count: int) -> List[SearchResult]:
    """Search the song index for *term* and return at most *count* results.

    The raw ``term`` and its token list select one of these query shapes,
    checked in this order:

    * ``term`` equal to ``"songs"``, or no tokens: match every document.
    * ``term`` containing ``top`` and ``songs``/``artist``: documents with a
      non-negative rating, sorted by rating in descending order.
    * tokens containing ``artist``, ``lyrics`` or ``album`` together with
      ``:``: the remaining tokens must match that field; other fields only
      contribute to the score.
    * anything else: a ``dis_max`` over fuzzy title and artist-name matches.

    Args:
        term: The user's query string.
        count: Maximum number of hits to return.

    Returns:
        A list of ``SearchResult`` objects built from the returned documents.
    """
    client = Elasticsearch()

    # Elasticsearch 6 requires the content-type header to be set, and this is
    # not included by default in the current version of elasticsearch-py
    client.transport.connection_pool.connection.headers.update(HEADERS)

    terms = SinhalaTokenizer().tokenize(term)

    print(terms)

    if term == "songs" or not terms:
        match_everything = {"bool": {"must": [{"match_all": {}}]}}
        return _run_query(client, INDEX_NAME, match_everything, count)

    if 'top' in term and ('songs' in term or 'artist' in term):
        if 'songs' in term:
            range_field, sort_key = 'track_rating.sort', '-track_rating.sort'
        else:
            # NOTE(review): the filter uses 'artist_rating' while the sort
            # uses 'artist_rating.sort'; kept as in the original -- confirm
            # against the index mapping.
            range_field, sort_key = 'artist_rating', '-artist_rating.sort'
        rating_query = {
            'bool': {
                'must': {
                    'range': {
                        range_field: {
                            'gte': 0
                        }
                    }
                },
            }
        }
        return _run_query(client, "tokenized", rating_query, count,
                          sort=sort_key)

    if ':' in terms:
        # (keyword, must-field, fuzziness, should-fields, should-operator,
        #  index, debug prefix), checked in the original branch order.
        scoped_searches = (
            ('artist', 'artist_name', 'AUTO', ['title^2', 'lyrics'], 'or',
             "tokenized", 'artist got here '),
            ('lyrics', 'lyrics', '2', ['title^3', 'artist_name'], 'and',
             INDEX_NAME, 'lyrics got here '),
            ('album', 'album_name', 'AUTO', ['title^3', 'artist_name'], 'and',
             INDEX_NAME, 'albuns got here '),
        )
        for (keyword, field, fuzziness, should_fields, should_operator,
             index, debug_prefix) in scoped_searches:
            if keyword not in terms:
                continue
            # Drop the first occurrence of the keyword and of the colon;
            # whatever remains is the text to search for.
            terms.remove(keyword)
            terms.remove(':')
            term = " ".join(terms)
            print(debug_prefix + term)
            scoped = _scoped_query(field, term, fuzziness, should_fields,
                                   should_operator)
            return _run_query(client, index, scoped, count)

    term = " ".join(terms)
    print('else got here ' + term)
    # 'tie_breaker' must sit inside the 'dis_max' object.  The previous
    # top-level "tie-breaker" key made this a two-key dict, which
    # elasticsearch-dsl's Q() rejects, so this branch could never run.
    # Lyrics are not part of this query (the old lyrics clause was unused).
    dis_max_query = {
        'dis_max': {
            'queries': [
                _fuzzy_match('title', term),
                _fuzzy_match('artist_name', term),
            ],
            'tie_breaker': 0.5
        }
    }
    return _run_query(client, INDEX_NAME, dis_max_query, count)