def check(self, word, language=None):
    """Check whether *word* is spelled correctly.

    Returns True if the word is found (or is a number), False if it is
    not found or no dictionary exists for the language, and None for an
    empty/whitespace-only input.

    :param word: the word to check (surrounding whitespace is ignored).
    :param language: language code; if None the language is auto-detected.
    """
    word = word.strip()
    if word == "":
        return None
    # Numbers are never spell-checked.
    if silpautils.is_number(word):
        return True
    # A language switch invalidates the cached word list.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = detect_lang(word)[word]
    else:
        self.lang = language
    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found
        return False
    if word in self.NWORDS:
        return True
    # If the word is cased (e.g. English), retry with the first letter
    # lower-cased: the word may simply start a sentence.
    if word.upper() != word.lower():
        newword = word[0].lower() + word[1:]
        wordlist = self.get_wordlist(newword)
        if wordlist is None:
            # Don't cache a missing dictionary, and don't crash on
            # `newword in None`.
            return False
        self.NWORDS = wordlist
        return newword in self.NWORDS
    return False
def suggest(self, word, language=None, distance=2):
    """Suggest corrections for *word*.

    Returns None for empty input, the word itself (a str) if it is
    already in the dictionary, otherwise a list of candidate
    corrections within *distance* Levenshtein edits (possibly empty).

    :param word: the word to correct (surrounding whitespace is ignored).
    :param language: language code; if None the language is auto-detected.
    :param distance: maximum Levenshtein distance for candidates.
    """
    word = word.strip()
    if word == "":
        return None
    # A language switch invalidates the cached word list.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = detect_lang(word)[word]
    else:
        self.lang = language
    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found: no suggestions possible.
        return []
    if word in self.NWORDS:
        return word
    candidates = []
    for candidate in self.NWORDS:
        # A length difference beyond the threshold already implies a
        # Levenshtein distance beyond the threshold — skip cheaply.
        if abs(len(candidate) - len(word)) > distance:
            continue
        if self.levenshtein(candidate, word) <= distance:
            candidates.append(candidate)
    candidates = self.filter_candidates(word, candidates)
    if not candidates:
        # The word may be two words accidentally merged: try every
        # split point leaving at least two letters on each side.
        for pos in range(2, len(word) - 2):
            if self.check(word[:pos], self.lang) and self.check(word[pos:], self.lang):
                candidates.append(word[:pos] + " " + word[pos:])
                candidates.append(word[:pos] + "-" + word[pos:])
    return candidates
def suggest(self, word, language=None, distance=2):
    """Suggest corrections for *word*.

    Returns None for empty input, the word itself (a str) if it is
    already in the dictionary, otherwise a list of candidate
    corrections within *distance* Levenshtein edits (possibly empty).

    :param word: the word to correct (surrounding whitespace is ignored).
    :param language: language code; if None the language is auto-detected.
    :param distance: maximum Levenshtein distance for candidates.
    """
    word = word.strip()
    if word == "":
        return None
    # A language switch invalidates the cached word list.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = detect_lang(word)[word]
    else:
        self.lang = language
    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found: no suggestions possible.
        return []
    if word in self.NWORDS:
        return word
    candidates = []
    for candidate in self.NWORDS:
        # A length difference beyond the threshold already implies a
        # Levenshtein distance beyond the threshold — skip cheaply.
        if abs(len(candidate) - len(word)) > distance:
            continue
        if self.levenshtein(candidate, word) <= distance:
            candidates.append(candidate)
    candidates = self.filter_candidates(word, candidates)
    if not candidates:
        # The word may be two words accidentally merged: try every
        # split point leaving at least two letters on each side.
        for pos in range(2, len(word) - 2):
            if self.check(word[:pos], self.lang) and self.check(word[pos:], self.lang):
                candidates.append(word[:pos] + " " + word[pos:])
                candidates.append(word[:pos] + "-" + word[pos:])
    return candidates
def check(self, word, language=None):
    """Check whether *word* is spelled correctly.

    Returns True if the word is found (or is a number), False if it is
    not found or no dictionary exists for the language, and None for an
    empty/whitespace-only input.

    :param word: the word to check (surrounding whitespace is ignored).
    :param language: language code; if None the language is auto-detected.
    """
    word = word.strip()
    if word == "":
        return None
    # Numbers are never spell-checked.
    if silpautils.is_number(word):
        return True
    # A language switch invalidates the cached word list.
    if self.lang != language:
        self.NWORDS = None
    if language is None:
        self.lang = detect_lang(word)[word]
    else:
        self.lang = language
    if self.NWORDS is None:
        self.NWORDS = self.get_wordlist(word)
    if self.NWORDS is None:
        # Dictionary not found
        return False
    if word in self.NWORDS:
        return True
    # If the word is cased (e.g. English), retry with the first letter
    # lower-cased: the word may simply start a sentence.
    if word.upper() != word.lower():
        newword = word[0].lower() + word[1:]
        wordlist = self.get_wordlist(newword)
        if wordlist is None:
            # Don't cache a missing dictionary, and don't crash on
            # `newword in None`.
            return False
        self.NWORDS = wordlist
        return newword in self.NWORDS
    return False
# print(author) for poem_link in poem_links: sleep(1) if ctrlc: quit() else: link2, title = poem_link.xpath('./@href')[0], poem_link.xpath( './@title')[0] data['title'] = title poem_page = requests.get(website + link2) poem = html.fromstring( poem_page.content).xpath('//div[@class="poem"]/p/text()') poem_str = "".join(poem).strip() if poem_str: data['text'] = poem_str lang = detect_lang(poem_str) if lang == "ne": data['lang'] = "नेपाली" else: with open(os.path.join(dir, "undetected.txt"), "a") as f: f.write(lang + "\n") f.write(poem_str + "\n") continue with open( os.path.join(dir, title.split('/')[0]) + '.txt', "w") as f: f.write(json.dumps(data))
# NOTE(review): this chunk starts mid-loop; the leading `continue` closes a
# guard whose condition lies outside this view.
    continue
# The post body lives in <div class="entry-content">: the first <p> holds
# the author name inside <strong>, the remaining <p> elements are the text.
content = sahitya_content.xpath(
    '//div[@class="entry-content"]/p')
author = content[0].xpath('//strong/text()')[0]
data['author'] = author
content_str = ''.join(
    map(
        lambda x: html.tostring(
            x, encoding='unicode', pretty_print=True), content[1:]))
data['text'] = content_str
# Prefer the language implied by the section type; otherwise fall back to
# automatic language detection.
if sahitya_type in lang_map:
    data['lang'] = lang_map[sahitya_type]
else:
    try:
        lang = detect_lang(content_str)
    except lang_detect_exception.LangDetectException:
        # Detection failed (e.g. text too short) — skip this entry.
        print("error caught")
        continue
    except Exception:
        print("Exception")
        continue
    if lang == 'ne':
        data['lang'] = "नेपाली"
    elif lang == 'hi':
        data['lang'] = "हिन्दी"
    # elif lang == 'Unknown':
    #     data['lang'] = lang
    else:
        # Any other detected language is skipped.
        continue