def process_Query_Type(self, tokens):
    """Classify a tokenized query and derive per-field boost weights.

    Each token — and its morphological base/affix from sinling's word
    splitter — is matched against the module-level ``*_boosters`` keyword
    sets; every hit records a boost category and raises the matching
    index field's weight to 2.

    Args:
        tokens: list of query tokens (Sinhala words).

    Returns:
        Tuple ``(boost_params, boosts, query_mod)``: the set of matched
        categories, the field→weight map, and the original tokens plus
        their base forms joined into a single query string.
    """
    boosts = {"title_sinhala": 1, "artist_name": 1, "writer_name": 1,
              "music": 1, "movie": 1, "genre": 1, "lyrics": 1}
    # (keyword set, category label, boosted field or None) — replaces the
    # original six copy-pasted if-blocks with one data-driven pass.
    rules = [
        (views_boosters, "view", None),
        (artist_name_boosters, "artist_name", "artist_name"),
        (writer_name_boosters, "writer_name", "writer_name"),
        (music_boosters, "music", "music"),
        (movie_boosters, "movie", "movie"),
        (genre_boosters, "genre", "genre"),
    ]
    boost_params = []
    additional_tokens = []
    for token in tokens:
        splits = word_splitter.split(token)  # {'base': ..., 'affix': ...}
        additional_tokens.append(splits['base'])
        for keywords, label, field in rules:
            if (token in keywords or splits['affix'] in keywords
                    or splits['base'] in keywords):
                boost_params.append(label)
                if field is not None:
                    boosts[field] = 2
    query_mod = " ".join(tokens + additional_tokens)
    return set(boost_params), boosts, query_mod
def predictQType(self, tokens):
    """Predict the query type from its tokens and build field boosts.

    Looks each token (plus its sinling base/affix) up in the module-level
    booster keyword sets; matches add a category to the result and bump
    the corresponding field weight to 2.

    Returns:
        ``(categories, field_weights, query_mod)`` — a set of matched
        category names, the field→weight dict, and the tokens joined with
        their base forms into one query string.
    """
    field_weights = {"title_si": 1, "artist": 1, "writer": 1,
                     "music": 1, "genre": 1.0, "lyrics": 1}
    matched = []
    base_forms = []
    print(tokens)
    for word in tokens:
        # Split the token so affixes can be matched independently.
        parts = word_splitter.split(word)
        base_forms.append(parts['base'])
        probes = (word, parts['affix'], parts['base'])

        def hits(keyword_set):
            # True when the token, its affix, or its base is a keyword.
            return any(p in keyword_set for p in probes)

        if hits(rating_boosters):
            matched.append("rate")
        if hits(artist_boosters):
            matched.append("artist")
            field_weights['artist'] = 2
        if hits(writer_boosters):
            matched.append("writer")
            field_weights['writer'] = 2
        if hits(music_boosters):
            matched.append("music")
            field_weights['music'] = 2
        if hits(genre_boosters):
            matched.append("genre")
            field_weights['genre'] = 2
    query_mod = " ".join(tokens + base_forms)
    return set(matched), field_weights, query_mod
def process_word(sentence, min_length=10):
    """Split long words of *sentence* into base + affix pieces.

    Words shorter than *min_length* characters are kept verbatim; longer
    words are run through sinling's word splitter and contribute their
    base and affix as two separate pieces.

    Args:
        sentence: whitespace-separated input text.
        min_length: length threshold above which a word is split
            (default 10, matching the original hard-coded value).

    Returns:
        ``(pieces, final_query)`` — the list of pieces and a single
        string joining them.  NOTE: the joined string intentionally keeps
        the original's leading-space behavior (" " alone for no pieces,
        two leading spaces otherwise) for backward compatibility.
    """
    pieces = []
    for raw_word in sentence.split():
        if len(raw_word) < min_length:
            pieces.append(raw_word)
            continue
        result = word_splitter.split(raw_word)
        pieces.append(result['base'])
        pieces.append(result['affix'])
    # join replaces the original quadratic `str +=` accumulation; the
    # sentinel " " element reproduces its exact leading-space output.
    final_query = " ".join([" "] + pieces)
    return pieces, final_query
def create_orig_to_base_json(lines: list):
    """Write a word→base-form mapping for Sinhala words to a fresh JSON file.

    Args:
        lines: list of sentences; each is whitespace-split into words.

    Side effects:
        Creates ``<uuid4>.json`` in the working directory containing a
        dict of original word to sinling base form (first occurrence
        wins).  Words that make the splitter raise are logged and skipped.
    """
    from sinling import word_splitter as ws
    out_path = str(uuid.uuid4()) + ".json"
    mapping = dict()
    for sentence in lines:
        for token in sentence.split():
            if not is_sinhala_word(token):
                continue
            try:
                stem = ws.split(token)['base']
                print(token, stem)
                if token not in mapping:
                    mapping[token] = stem
            except Exception as err:
                # Best-effort: log the failing word and keep going.
                print("word:", token)
                print(err, "\n")
    with open(out_path, 'w', encoding='utf8') as outfile:
        json.dump(mapping, outfile, ensure_ascii=False)
def identifyContext(self, tokens, query):
    """Detect boostable fields in the query and strip trigger words.

    Each token is split into base + affix; possessive "ගේ" affixes are
    rewritten to the base form, "ගීත" is blanked out, and tokens matching
    the rating/artist/writer identifier sets (or genre_boosters) add a
    boosting field — most matches also remove the trigger text from the
    query via ``self.replaceUnwantedData``.

    Returns:
        ``(fields, weights, query)`` — deduplicated field list, the
        field→weight dict, and the cleaned query string.
    """
    fields = []
    weights = {}
    print(tokens)
    for token in tokens:
        parts = word_splitter.split(token)  # base + affix of the token
        # Possessive marker: replace the inflected token with its base.
        if parts['affix'] == "ගේ":
            query = query.replace(token, parts['base'])
        if "ගීත" in token:
            query = query.replace("ගීත", " ")
        probes = (token, parts['affix'], parts['base'])

        def matches(identifier_set):
            return any(p in identifier_set for p in probes)

        if matches(rating_identifiers):
            fields.append("rate")
            print(token)
            query = self.replaceUnwantedData(
                query, [token, parts['base'], parts['affix']])
        if matches(artist_identifiers):
            fields.append("artist")
            weights['artist'] = 2.0
            # Only the affix is stripped for artist matches.
            query = self.replaceUnwantedData(query, [parts['affix']])
        if matches(writer_identifiers):
            fields.append("writer")
            weights['writer'] = 2.0
            query = self.replaceUnwantedData(
                query, [token, parts['base'], parts['affix']])
        if matches(genre_boosters):
            fields.append("genre")
            weights['genre'] = 3.0
    return list(set(fields)), weights, query
def get_base(word):
    """Return the morphological base form of *word* via sinling's splitter."""
    parts = word_splitter.split(word)
    return parts['base']
def split():
    """Flask endpoint: split the posted JSON body's "word" into base/affix."""
    payload = request.json
    print(payload)
    return word_splitter.split(payload["word"])
def query_process(query):
    """Build an Elasticsearch request body for a lyrics search.

    Tokenizes *query*; if any token (or any token's affix) is in
    ``artist_consider``, the "artist" field gets a 3.0 boost and, on an
    affix match, the token's base form is appended to the query text.

    Args:
        query: the raw user query string.

    Returns:
        dict — ES body with a most_fields multi_match plus a fixed
        1000-wide "rate" range aggregation from 0 to 10000.
    """
    artist_flag = False
    tokens = tokenizer.tokenize(query)
    print(tokens)
    for word in tokens:
        if word in artist_consider:
            artist_flag = True
            break
        # Hoisted: the original called word_splitter.split(word) three
        # times per word; one call is enough.
        parts = word_splitter.split(word)
        print(parts)
        affix = parts['affix']
        print(affix)
        if affix in artist_consider:
            artist_flag = True
            query = query + ' ' + parts['base']
            break
    print(artist_flag)
    # Only the field boosts differ between the two cases — deduplicated
    # from the original's two near-identical body literals.
    if artist_flag:
        fields = ["artist^3.0", "lyrics^1.0", "title^1.0",
                  "musicArtist^1.0", "lyricsArtist^1.0",
                  "genre^1.0", "movie^1.0"]
    else:
        fields = ["artist", "lyrics", "title", "musicArtist",
                  "lyricsArtist", "genre", "movie"]
    return {
        "query": {
            "multi_match": {
                "type": "most_fields",
                "query": query,
                "fields": fields
            }
        },
        "aggs": {
            "rate_range": {
                "range": {
                    "field": "rate",
                    "ranges": [{"from": lo, "to": lo + 1000}
                               for lo in range(0, 10000, 1000)]
                }
            }
        }
    }
def search():
    """Flask search endpoint over the "lyrics" Elasticsearch index.

    Reads the form field "query", tokenizes it, and:
      * boosts genre / lyrics-by / music-by / key / artist fields when
        their trigger keywords appear;
      * treats "popular"-style keywords as a rating query (sorted by
        views, triggers removed);
      * extracts a numeric token as the result-set size;
      * stems tokens longer than 5 characters with sinling.

    Returns:
        Rendered ``ui.html`` with the hit sources, or a no-results page.
    """
    boosting_list = ["name"]
    is_rating_query = False
    numeric_value = 0
    query_request = request.form["query"].strip().lower()
    token_list = tokenizer.tokenize(query_request)

    # Keyword-triggered field boosts.
    if any(x in genere_list for x in token_list):
        boosting_list.append("genere^2")
    if any(x in lyrics_by_list for x in token_list):
        boosting_list.append("lyrics by^2")
    if any(x in music_by_list for x in token_list):
        boosting_list.append("music by^2")
    if any(x in key_list for x in token_list):
        boosting_list.append("key^2")
    if any(("ගේ" in x or x in artist_list) for x in token_list):
        boosting_list.append("artist^2")
    if any(x in popular_list for x in token_list):
        token_list = [t for t in token_list if t not in popular_list]
        is_rating_query = True

    # BUG FIX: the original removed numeric items from token_list while
    # iterating it, which skips the element following each removal.
    # Collect numerics first, then filter; the last numeric wins as size.
    numeric_tokens = [t for t in token_list if t.isnumeric()]
    if numeric_tokens:
        for t in numeric_tokens:
            print(t)
        token_list = [t for t in token_list if not t.isnumeric()]
        numeric_value = int(numeric_tokens[-1])

    # Stem longer tokens; short ones pass through unchanged.
    processed_query_request = " ".join(
        word_splitter.split(item)["base"] if len(item) > 5 else item
        for item in token_list)
    # De-duplicate boosts while preserving insertion order.
    boosting_list = list(dict.fromkeys(boosting_list))
    print(numeric_value)
    print(query_request)
    print(token_list)
    print(processed_query_request)
    print(boosting_list)
    print(is_rating_query)

    if len(processed_query_request) == 0:
        query = {"query": {"match_all": {}}}
    else:
        query = {"query": {"multi_match": {
            "query": processed_query_request,
            "fields": boosting_list}}}
    if is_rating_query:
        query["sort"] = [{'views': 'desc'}]
    if numeric_value != 0:
        query["size"] = numeric_value

    boosted_query = es.search(index="lyrics", doc_type="doc",
                              body=json.dumps(query))
    hits = boosted_query["hits"]["hits"]
    if len(hits) == 0:
        return render_template('ui.html', result="No search result exists")
    lyrcs_list = [lyrics["_source"] for lyrics in hits]
    return render_template('ui.html', results=lyrcs_list)
# Standalone script: read whitespace-separated Sinhala words from new.txt,
# reduce each unique word to its morphological base with sinling, and write
# the unique bases to roots.txt (one per line).
sys.path.append("/home/basa/sinling/Sinling")
from sinling import word_splitter

filepath = "new.txt"
unique_words = {}   # dict used as an insertion-ordered set of words
unique_bases = {}   # dict used as an insertion-ordered set of base forms
with open(filepath, encoding="utf8") as fp:
    for line in fp:
        # split() instead of the original split(' '): runs of whitespace
        # no longer produce empty-string tokens that would be fed to the
        # word splitter.
        for token in line.strip().split():
            unique_words[token] = 0
for word in unique_words:
    results = word_splitter.split(word)
    unique_bases[results['base']] = 0
# 'with' guarantees roots.txt is closed even if a write fails (the original
# relied on an explicit close with no error handling).
with open('roots.txt', 'w', encoding="utf8") as out:
    for base in unique_bases:
        out.write(base + "\n")