import pickle

import enchant

# Build a word co-occurrence table: D[w][x] counts how often x appears on the
# same line as w.  `Word` is assumed to be defined earlier (a list of token lists).
D = {}
for line in Word:
    for w in line:
        for x in line:
            if x == w:
                continue
            # print(x, w)
            if w not in D:
                D[w] = {}
            if x not in D[w]:
                D[w][x] = 0
            D[w][x] = D[w][x] + 1

dct = enchant.Dict("en_US")

#off = ["pcfg_dict_correlation_scores.txt", "pcfg_dict_num_correlation_scores.txt", "pcfg_ipv4_correlation_scores.txt", "pcfg_ipv4_num_correlation_scores.txt", "srizbi_correlation_scores.txt", "torpig_correlation_scores.txt", "zeus_correlation_scores.txt", "kraken_correlation_scores.txt", "DNL1_correlation_scores.txt", "DNL2_correlation_scores.txt", "DNL3_correlation_scores.txt", "DNL4_correlation_scores.txt", "500KL1_correlation_scores.txt", "500KL2_correlation_scores.txt", "500KL3_correlation_scores.txt", "9ML1_correlation_scores.txt"]
#iff = ["pcfg_dict.txt", "pcfg_dict_num.txt", "pcfg_ipv4.txt", "pcfg_ipv4_num.txt", "srizbi.txt", "torpig.txt", "zeus.txt", "kraken.txt", "DNL1.txt", "DNL2.txt", "DNL3.txt", "DNL4.txt", "500KL1.txt", "500KL2.txt", "500KL3.txt", "9ML1.txt"]
off = ["benign_correlation_scores.txt"]
iff = ["benign.txt"]

for it in range(0, 1):
    bad_repo = {}
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    dct = enchant.Dict("en_US")
    outF = open(off[it], "w")
    with open(iff[it]) as f:  # change file name
        lines = f.readlines()
        count = 0
import enchant

dictio = enchant.Dict("en_US")
alph = "abcdefghijklmnopqrstuvwxyz"

# Find five-letter words where the "rn" pair could be misread as "m" and the
# result is still a valid English word.
file = open(
    "C:/Users/cdobb/AppData/Local/Programs/Python/Python38-32/Lib/site-packages/enchant/data/mingw32/share/enchant/hunspell/en_US.dic"
)
for line in file:
    if "/" in line:
        line = line[0:(line.index("/"))]
    line = line.lower()
    if len(line) == 5 and "rn" in line:
        temp = ""
        skip = line.index("rn")
        for i in range(0, len(line)):
            if i != skip and i != skip + 1:
                temp += line[i]
            if i == skip:
                temp += "m"
        if dictio.check(temp):
            print(line + ", " + temp)
file.close()
#!/usr/bin/python
import itertools

import enchant

dictionary = enchant.Dict("en_US")


def spellcheck(string):
    return dictionary.check(string)


def generator(letters, length):
    # Check every permutation of the given letters for a valid English word.
    permutation = list(itertools.permutations(letters, length))
    for l in permutation:
        word = ''.join(l)
        if spellcheck(word):
            print(word)


def main():
    string = input("Enter all the letters\n")
    length = int(input("Enter length of the word\n"))
    generator(string, length)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
import sys

import enchant

englishDict = enchant.Dict('en_US')


def checkWord(word):
    return englishDict.check(word)


def spellCheck(sentence, tolerance):
    # A sentence is valid if it contains at most `tolerance` misspelled words.
    words = [word for word in sentence.split(' ') if len(word) > 1]
    faults = [not checkWord(word) for word in words if len(word) != 0]
    return sum(faults) <= tolerance


#####################################
# Very strict sentence cleaning
# currently removes ALL sentences with spelling errors(!)
def clean_the_text(text, remove_numbers=False):
    print('\n', '@' * 75, '\n', 'CLEANING THE TEXT', '\n\n')

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(text, 'lxml')
    # print('PRETTYING UP THE TEXT IN THE CLEANING: ', '\n\t', soup.prettify())
    # text = soup.text

    from pattern.web import URL, plaintext
    text = plaintext(text, keep=[], linebreaks=2, indentation=False)

    import unicodedata
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    import re
    clean = re.compile('<.*?>}{')
    text = re.sub(clean, '', text)

    # Strip mangled byte-string markers, escaped UTF-8 sequences, and leftover
    # HTML/whitespace junk.  The substitutions run in the original order.
    replacements = [
        ('b"', ''), ("b'", ''), ("\'", "'"), ('\\n', ' '),
        ('\\xc2\\xae', ''), ('\n', ' '), ('\t', ''), ('\s+', ''),
        ('\r\r\r', ''), ('\\xc2\\xa9 ', ''), ('\\xe2\\x80\\x9c', ''),
        ('xe2x80x93', ','), ('\\x0c', ''), ('\\xe2\\x80\\x9d', ''),
        ('\\xe2\\x80\\x90', ''), ('\\xe2\\x80\\x9331', ''), ('xe2x80x94', ''),
        ('\x0c', ' '), (']', '] '), ('\\xe2\\x80\\x99', "'"),
        ('xe2x80x99', "'"), ('\\xe2\\x80\\x933', ''), ('\\xe2\\x80\\x935', ''),
        ('\\xef\\x82\\xb7', ''), ('\\', ''), ('xe2x80x99', ''),
        ('xe2x80x9cwexe2x80x9d', ''), ('xe2x80x93', ', '),
        ('xe2x80x9cEUxe2x80x9d', ''), ('xe2x80x9cxe2x80x9d', ''),
        ('xe2x80x9cAvastxe2x80x9d', ''), ('xc2xa0', ''),
        ('xe2x80x9cxe2x80x9d', ''), ('xe2x80x9c', ''), ('xe2x80x9d', ''),
        ('tttttt', ' '), ('activetttt.', ''),
        ('.sdeUptttt..sdeTogglettttreturn', ''), ('ttif', ''), ('.ttt.', ' '),
        (' t t ', ' '), ('tttt ', ''), (' tt ', ' '), (' t ', ' '),
        ('ttt', ''), ('ttr', ''), (' >t ', ''), ('.display', ''),
        ('div class', ''), ('div id', ' '), ('Pocy', 'Policy'),
        ('xc2xa0a', ' '), (' b ', ''), ('rrrr', ''), ('r r r r r ', ''),
        ('rtttr', ''),
        ('    ', ' '), ('   ', ' '), ('  ', ' '),  # collapse runs of spaces
        (' r ', ' '), (' tr ', ' '), (' rr r ', ' '), ('r r r', ''),
        ('* t', '* '), ('r *', ' *'), (' tt t t rt ', ' '),
        ('r rrr r trr ', ' '), (' r t', ''), (' r tt', ''),
        (' xe2x80x93 ', ' '), (' xe6xa8x82xe9xbdxa1xe6x9cx83 ', ' '),
        (' rrr ', ' '), (' rr ', ' '), (' r r ', ''), ('tr ', ''),
        ('* xe7xaex80xe4xbdx93xe4xb8xadxe6x96x87', ''), ('tt*', ''),
    ]
    for old, new in replacements:
        text = text.replace(old, new)

    return text


print('*' * 10, 'DROPPING NON-ENGLISH WORDS FROM THE TEXT', '*' * 10)
from nltk.tokenize import word_tokenize
token_text_w = word_tokenize(text)
import enchant

d = enchant.Dict('en_US')
bad_words = []
for word in token_text_w:
    if not d.check(word):
        bad_words.append(word)
bad_words = set(bad_words)
for word in token_text_w:
    if word in bad_words:
        text = text.replace(word, '')

# Trial of a new way of cleaning the text
index = 0
print('\n\n', '*' * 10, len(tokenize_by_sentences(a)), '*' * 10, '\n\n')
for sent in tokenize_by_sentences(a):
    if ('js' in sent or 'css' in sent or 'png' in sent or 'woff2' in sent
            or ' div ' in sent or ' meta "" ' in sent or 'span' in sent):
        a = a.replace(sent, '')
        print('\n', '*' * 25, '\n',
              'CLEANING TOKENIZED SENTENCES OF CODE IN INDEX', index, '*' * 25)
    index += 1
return (text)
"17", "67" ] def convert_plate(plate): for idx, used in enumerate(nums_used): real_plate = plate[:2] + plate[2:4].replace(used, nums_conv[idx]) + plate[4:] if real_plate != plate: break return real_plate print "Welcome to plate finder (running with " + str(num_cores) + " cores)" dict = enchant.Dict("en_GB") count = 0 found = 0 output = "plates" ext = ".txt" def process_plate(c1): out = open(output + "-" + str(uuid.uuid4()) + ext, "w") for c2 in chars_pre: for c34 in nums_used: for c5 in chars_rnd: for c6 in chars_rnd: for c7 in chars_rnd: plate = c1 + c2 + c34 + c5 + c6 + c7 if dict.check(plate):
from .features import Dictionary, RegexMatches, Stopwords name = "vietnamese" try: import enchant dictionary = enchant.Dict("vi") except enchant.errors.DictNotFoundError: raise ImportError("No enchant-compatible dictionary found for 'vi'. " + "Consider installing 'hunspell-vi'.") dictionary = Dictionary(name + ".dictionary", dictionary.check) """ :class:`~revscoring.languages.features.Dictionary` features via `enchant.Dict <https://github.com/rfk/pyenchant>`_ "vi". Provided by `hunspell-vi`. """ # https://vi.wiktionary.org/wiki/Th%C3%A0nh_vi%C3%AAn:Laurent_Bouvier/ # Free_Vietnamese_Dictionary_Project_Vietnamese-Vietnamese#Allwiki_.28closed.29 stopwords = set([ "ai", "bằng", "bị", "bộ", "cho", "chưa", "chỉ", "cuối", "cuộc", "các", "cách", "cái", "có", "cùng", "cũng", "cạnh", "cả", "cục", "của", "dùng", "dưới", "dừng", "giữa", "gì", "hay", "hoặc", "khi", "khác", "không", "luôn", "là", "làm", "lại", "mà", "mọi", "mỗi", "một", "nhiều", "như", "nhưng", "nào", "này", "nữa", "phải", "qua", "quanh", "quá", "ra", "rất", "sau", "sẽ", "sự", "theo", "thành", "thêm", "thì", "thứ", "trong", "trên", "trước", "trừ", "tuy", "tìm", "từng", "và", "vài", "vào", "vì", "vẫn", "về", "với", "xuống", "đang", "đã", "được", "đấy", "đầu", "đủ" ]) stopwords = Stopwords(name + ".stopwords", stopwords)
import nltk
# to remove stopwords
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn import linear_model, svm, neighbors, naive_bayes
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import enchant
import pandas as pd

SPELLING_DICT = enchant.Dict("en_US")

# from nltk.tokenize import sent_tokenize
# from nltk import word_tokenize, pos_tag, ne_chunk


def load_data():
    data = pd.read_csv('deceptive-opinion.csv')
    data = data.drop(columns="hotel")
    data = data.drop(columns="source")
    data = data.drop(columns="polarity")
    data.rename(columns={'deceptive': 'real'}, inplace=True)

    stop = stopwords.words('english')
    data['text'] = data['text'].apply(
        lambda x: ' '.join([item for item in x.split() if item not in stop]))

    data.loc[data['real'] == 'truthful', 'real'] = 1
    data.loc[data['real'] == 'deceptive', 'real'] = 0
from .features import Dictionary, RegexMatches, Stopwords name = "serbian" try: import enchant dictionary = enchant.Dict("sr") except enchant.errors.DictNotFoundError: raise ImportError("No enchant-compatible dictionary found for 'sr'. " + "Consider installing 'hunspell-sr'.") dictionary = Dictionary(name + ".dictionary", dictionary.check) """ :class:`~revscoring.languages.features.Dictionary` features via `enchant.Dict <https://github.com/rfk/pyenchant>`_ "sr". Provided by `hunspell-sr`. """ # https://meta.wikimedia.org/w/index.php?oldid=17213519 stopwords = [ r"административна", r"административног", r"али", r"америчке", r"америчких", r"астрономија", r"база", r"без", r"београд", r"била", r"били", r"било",
def __init__(self):
    super(Spellcheck, self).__init__()
    # Pick the dictionary language from the locale, falling back to en_US.
    lang = os.environ.get("LC_CTYPE", "en_US.utf-8").split('.')[0]
    self.dictionary = enchant.Dict(lang)
    self.print_err("loaded dictionary for {}".format(lang))
    self.in_word = False
# import pymorphy2
import enchant, re, nltk, pymorphy2
from nltk.corpus import stopwords

dictionary = enchant.Dict("ru_RU")
nltk.download('stopwords')
morph = pymorphy2.MorphAnalyzer(lang='ru')
# print(morph.parse('стали'))
# print(morph.parse('стали')[0].normal_form)

# import pandas as pd
# df = pd.read_csv('data.csv', sep=r'((?:(^\d+));)|(^(?:(ID));)', skiprows=0, index_col=0)
# print(df)
# print(df.to_string())
# from numpy import genfromtxt
# my_data = genfromtxt('data.csv', delimiter=';')

stopwords_ru = stopwords.words("russian")

data = ['ID\tQuestion\n']
missplells_log = []
words_errors = []

for idx, line in enumerate(open('onlyquestions').readlines()[:10]):
# for idx, line in enumerate(open('onlyquestions').readlines()):
    t = line.strip()
    split_pattern = r'[«]?[а-яА-Я]+[»]?'
    splitted = re.findall(split_pattern, t)
    # splitted = t.split()
if w in ['no', 'not']:
    print(label_class[i] + ': ' + 'no')

match = re.search('(\d+%)', s)
if match:
    pct = match.group(1)
    print(label_class[i] + ': ' + pct)

if label_class[i] == 'sale restriction':
    for w in s.split():
        if w in gazetteers.words('countries.txt'):
            print(label_class[i] + ': ' + w)
            break

if label_class[i] == 'tour code':
    for j in range(len(s.split())):
        if s.lower().split()[j] == 'code':
            w = s.split()[j + 1]
            if not enchant.Dict("en_US").check(w):
                print(label_class[i] + ': ' + w)

if label_class[i] in ['ticketing period', 'travelling period']:
    w = s.split()
    nw = []
    for j in range(len(w)):
        # Process case like "RELEASED: DEC 29, 201514-"
        if w[j].lower() == 'released':
            if w[j + 1].lower() in months or w[j + 2].lower() in months:
                w[j + 3] = w[j + 3][:4]
                if w[j + 3].isdecimal():
                    released_date = ' '.join(w[j + 1:j + 4])
        # Process case like "Ticket must be issued on/before31JAN, 2016"
        if w[j].isalnum() and not w[j].isalpha() and not w[j].isdecimal():
            for k, g in groupby(w[j], str.isalpha):
                nw.append(''.join(list(g)))
def remove_from_dict(dict):
    # Dict.remove() adds each word to the user's exclude list, so enchant no
    # longer accepts it as correctly spelled.
    d = enchant.Dict("ro_RO")
    for element in list(dict):
        d.remove(element)
def add_to_dict(dict):
    d = enchant.Dict("ro_RO")
    for i, element in enumerate(list(dict)):
        if not d.check(element):
            d.add(element)
        print('Element', i, 'out of', len(list(dict)))
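# A minimal usage sketch, not from the original source; the word list below is
# purely illustrative.  Dict.add() writes to the user's personal word list, so
# later check() calls should accept the word; remove_from_dict() undoes that.
import enchant

domain_terms = ["blockchain", "criptomoneda"]  # hypothetical examples
add_to_dict(domain_terms)
print(enchant.Dict("ro_RO").check("blockchain"))  # expected True after adding
remove_from_dict(domain_terms)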
def all_wordification(number):
    """Converts a number to a word"""
    import enchant  # English dictionary
    # from nltk.corpus import words  (alternate word searcher)
    from itertools import product  # Cartesian product
    from itertools import combinations  # Combination

    # Create a dictionary for number-to-letter conversion
    alph_num_dict = {
        '2': ('a', 'b', 'c'),
        '3': ('d', 'e', 'f'),
        '4': ('g', 'h', 'i'),
        '5': ('j', 'k', 'l'),
        '6': ('m', 'n', 'o'),
        '7': ('p', 'q', 'r', 's'),
        '8': ('t', 'u', 'v'),
        '9': ('w', 'x', 'y', 'z')
    }

    number = number[6:]  # delete area code
    number = number[:3] + number[4:]  # remove hyphen
    word = ""
    all_words = []
    d = enchant.Dict("en_US")  # check if word is a real English word

    # Find all combinations of numbers via cartesian product
    # prod = list(product('012', repeat=7))  # old cartesian product
    temp = []
    prod = []
    for index in range(len(number)):
        if number[index] in ['9', '7']:
            temp.append([0, 1, 2, 3])
        else:
            temp.append([0, 1, 2])
    for i in product(*temp):
        prod.append((i))

    # Find all inner combinations of numbers
    comb = list(combinations([0, 1, 2, 3, 4, 5, 6, 7], 2))
    # print(perm[0][1])

    i = 0
    for i in range(len(prod)):  # iterate through all number combos of word
        for index in range(len(number)):  # convert each digit to a letter
            p = number[index]
            word += alph_num_dict[p][int(prod[i][index])]  # add letter to word
        # if d.check(word):  # add new words to list
        #     all_words.append(word)
        # iterate through all words/subwords and check if they are real words
        for j in comb:
            temp = word[j[0]:j[1]]
            if d.check(temp):  # check if word is an English word
                # if temp in words.words():  # alternate word searcher
                temp = number[0:j[0]] + temp + number[j[1]:]
                all_words.append(temp)
        word = ""  # clear word

    return (all_words)
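# A hedged usage sketch, not part of the original.  It assumes the expected
# input looks like "1-800-724-6837" (country/area code plus hyphenated local
# part): the function strips the first six characters and the remaining hyphen,
# then searches for dictionary words hidden in the digits "7246837".
if __name__ == "__main__":
    candidates = all_wordification("1-800-724-6837")
    # "painter" should be among the returned candidates, alongside partial
    # matches that mix letters and leftover digits.
    print(candidates)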
import datetime


def timerun():
    a = datetime.datetime.now()
    print(a - datetime.datetime.now())


timerun()


# Exercise 3: Print a Word Provided by the User
b = input("Type something here:")
print(b)

# Exercise 4: Validate User Input
# For this I installed the Enchant library using 'pip install pyenchant'
import enchant

# It looked like only one dictionary at a time was available, at least easily,
# so I used the US dictionary.
d = enchant.Dict('en_US')

# I had to initialize i as 1 before the loop, so i = 1 would be a global
# setting needed for this program.  Did I need to define it that way?
i = 1


def runit():
    global i
    while i == 1:
        e = input("Type a word here:")
        print(e)
        while d.check(e) is False:
            e = input("Please type an English word:")
            print(e)
        # I had to use an extra if statement here to get the same result that I
        # did in R using recursion.  I don't know if that's because I'm doing
        # something wrong here.
        if d.check(e) is False:
            runit()
        i += 1
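# A possible simplification (my sketch, not part of the original exercise): the
# inner while-loop already keeps prompting until d.check(e) passes, so the extra
# if/recursion can never fire and the whole exercise reduces to one loop.
def runit_simple():
    e = input("Type a word here:")
    while not d.check(e):
        e = input("Please type an English word:")
    print(e)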
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
from xml.dom.minidom import parseString

# AUTO-load classifiers
# a trick to get the current module
_modname = globals()['__name__']
_this_mod = sys.modules[_modname]
_ppath = "/".join(_this_mod.__file__.split("/")[:-1])

d = enchant.Dict("en_US")

import json

# #CAN drop this if this is an app!
# DEPLOY_DIR = "/home/lentaing/envs/newdc1.4/src"
# sys.path.insert(0, DEPLOY_DIR)
# from django.core.management import setup_environ
# from django.utils.encoding import smart_str
# import settings
# setup_environ(settings)

from django.utils.encoding import smart_str
from datacollection import models

# dynamically load classifiers
# import classifiers
import sra
    if request.method == "POST":
        user_string = request.json['text']
        # flag stores whether the previously translated word was Hindi or not
        flag = request.json['flag']
        if user_string:
            # Call the translate function to process the string
            predictions = generate_predictions(transliterator_obj, user_string,
                                               eng_dict, hin_dict, classifier,
                                               flag)
            return json.dumps({"lists": predictions})
        else:
            # return an empty list if the user sends an empty string
            return json.dumps({"lists": []})


if __name__ == "__main__":
    # initialize the English and Hindi dictionaries
    eng_dict = enchant.Dict('en_US')
    hin_dict = enchant.Dict('hi_IN')
    # initialize the Transliterator object
    transliterator_obj = Transliterator(source='eng', target='hin')
    # initialize the language classifier
    classifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    # run the Flask app
    app.run('0.0.0.0', debug=True)
def get_all_tweets(screen_name):
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # Pull the user's timeline 200 tweets at a time until none are left.
    alltweets = []
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)
    oldest = alltweets[-1].id - 1
    while len(new_tweets) > 0:
        print("getting tweets before %s " % (oldest))
        new_tweets = api.user_timeline(screen_name=screen_name, count=200,
                                       max_id=oldest)
        alltweets.extend(new_tweets)
        oldest = alltweets[-1].id - 1
        print("...%s tweets downloaded so far" % (len(alltweets)))

    output = list()
    for i in range(len(alltweets)):
        analysis = TextBlob(alltweets[i].text)
        words = alltweets[i].text.split()
        misspelled = ""
        vulgar = ""
        for x in range(len(words)):
            if words[x] in vulgar_terms.bad_words.keys():
                vulgar += words[x] + ", "
            spellword = words[x]
            if (spellword.startswith("@") == False
                    and spellword.endswith(",") == False
                    and spellword.startswith("https://") == False
                    and spellword.startswith("http://") == False
                    and enchant.Dict("en_US").check(words[x]) == False):
                misspelled += words[x] + ", "
        misspelled = misspelled[:len(misspelled) - 2]
        vulgar = vulgar[:len(vulgar) - 2]

        # Fields are joined with "*"; empty vulgar/misspelled/sentiment fields
        # are reported as "N/A".  Non-negative tweets are kept only if they
        # contain vulgar or misspelled words.
        if (analysis.sentiment.polarity <= -0.2
                and analysis.sentiment.subjectivity >= 0.5):
            output.append("*".join([
                alltweets[i].id_str, alltweets[i].text,
                vulgar if vulgar else "N/A",
                misspelled if misspelled else "N/A",
                str(analysis.sentiment.polarity),
                str(analysis.sentiment.subjectivity)
            ]))
        elif vulgar or misspelled:
            output.append("*".join([
                alltweets[i].id_str, alltweets[i].text,
                vulgar if vulgar else "N/A",
                misspelled if misspelled else "N/A",
                "N/A", "N/A"
            ]))

    for i in range(len(output)):
        print(output[i])

    return output
def __init__(self, dict_name='en', max_dist=2):
    self.spell_dict = enchant.Dict(dict_name)
    self.max_dist = max_dist
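# A hedged sketch of the replace() method such a spelling-corrector class
# usually pairs with this __init__; it is not in the original snippet, and the
# nltk edit_distance import is my assumption.  Keep a correctly spelled word,
# otherwise accept the first enchant suggestion within max_dist edits.
from nltk.metrics import edit_distance

def replace(self, word):
    if self.spell_dict.check(word):
        return word
    suggestions = self.spell_dict.suggest(word)
    if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
        return suggestions[0]
    return word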
from .features import Dictionary, RegexMatches, Stemmed, Stopwords

name = "french"

try:
    import enchant
    dictionary = enchant.Dict("fr")
except enchant.errors.DictNotFoundError:
    raise ImportError("No enchant-compatible dictionary found for 'fr'. " +
                      "Consider installing 'myspell-fr'.")

dictionary = Dictionary(name + ".dictionary", dictionary.check)
"""
:class:`~revscoring.languages.features.Dictionary` features via
`enchant.Dict <https://github.com/rfk/pyenchant>`_ "fr".  Provided by
`myspell-fr`.
"""

try:
    from nltk.corpus import stopwords as nltk_stopwords
    stopwords = set(nltk_stopwords.words('french') + ["a"])
except LookupError:
    raise ImportError("Could not load stopwords for {0}. ".format(__name__) +
                      "You may need to install the nltk 'stopwords' " +
                      "corpora.  See http://www.nltk.org/data.html")

stopwords = Stopwords(name + ".stopwords", stopwords)
"""
:class:`~revscoring.languages.features.Stopwords` features provided by
`nltk.corpus.stopwords <https://www.nltk.org/api/nltk.corpus.html>`_ "french"
"""
from tqdm import tqdm_notebook
from scipy.stats import rankdata
from itertools import product
import enchant
from pymystem3 import Mystem
import numpy as np
import pandas as pd
import re
import gensim
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

enchant_d = enchant.Dict("ru")
# Despite the names, these are Snowball stemmers, not Mystem analyzers.
mystem = SnowballStemmer("russian")
mystem_e = SnowballStemmer("english")
russian_stopwords = stopwords.words("russian")
english_stopwords = stopwords.words("english")
tokenizer = RegexpTokenizer(r'\w+')


def preprocess_text(text, stemmer_I):
    text = text.lower()
    # text = ' '.join(tokenizer.tokenize(text))
    if stemmer_I:
        tokens = [mystem.stem(mystem_e.stem(token))
                  for token in re.sub('[^a-zа-я0-9]', ' ', text).split()
                  if not token in russian_stopwords \
                  and not token in english_stopwords \
                  # and not token.isdigit() \
import enchant
from pypinyin import lazy_pinyin

from utils.text_utils import tokenize, get_homophones_by_char, get_homophones_by_pinyin

traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # without tone marks

print(tokenize('小姑娘蹦蹦跳跳的去了她外公家'))

# Decide whether a string is pinyin or English
en_dict = enchant.Dict("en_US")
print(en_dict.check("hello"))
print(en_dict.check("hello boy what is your name"))

strs = "hello boy what is your name"
flag = False
for word in strs.split():  # check each word rather than each character
    if en_dict.check(word):
        flag = True
    else:
        flag = False
        break
print(flag)

print(en_dict.check("zhangsan"))
print(en_dict.check("zhangsan ni zai zhe li ma ?"))

pron = get_homophones_by_char('长')
def Classification(filename, srcTest, labelDict, TotalSpamHam, k):
    matrixConfussion = defaultdict(int)
    predictClass = dict()
    kamusKataSpam = defaultdict(int)
    kamusKataHam = defaultdict(int)
    kamusKata = list()

    # open the file of per-word spam/ham counts produced during training
    with open(filename, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            key = row['word']
            kamusKata.append(key)
            if key not in kamusKataSpam:
                if row['spam'] != '0':
                    kamusKataSpam[key] = row['spam']
            if key not in kamusKataHam:
                if row['ham'] != '0':
                    kamusKataHam[key] = row['ham']

    totalSpam = TotalSpamHam[0]
    totalHam = TotalSpamHam[1]
    probSpam = float(totalSpam + k) / (totalSpam + totalHam + len(TotalSpamHam))
    probHam = float(totalHam + k) / (totalSpam + totalHam + len(TotalSpamHam))
    tempprobSpam = copy.copy(probSpam)
    tempprobHam = copy.copy(probHam)

    kataDataUji = defaultdict(int)
    stopWords = set(stopwords.words('english'))
    d = enchant.Dict("en_US")
    lemma = nltk.wordnet.WordNetLemmatizer()

    files = os.listdir(srcTest)
    for file in files:
        fp = open(srcTest + file, 'r').read()
        listKata = fp.split(" ")
        for kata in listKata:
            kata = kata.lower()
            if kata not in stopWords:  # remove stopwords
                kata = lemma.lemmatize(kata)  # lemmatize
                if kata != '':
                    if not d.check(kata):
                        # replace misspelled words with enchant's first suggestion
                        suggest = d.suggest(kata)
                        if len(suggest) != 0:
                            kata = suggest[0]
                            kata = kata.lower()
                            kataDataUji[kata] += 1
                    else:
                        kataDataUji[kata] += 1

        for key in kataDataUji.keys():
            tempprobSpam *= (kataDataUji[key] *
                             ((float(kamusKataSpam[key]) + k) /
                              (len(kamusKataSpam) + len(kamusKata))))
            tempprobHam *= (kataDataUji[key] *
                            ((float(kamusKataHam[key]) + k) /
                             (len(kamusKataHam) + len(kamusKata))))

        if tempprobSpam < tempprobHam:
            predictClass[file] = '1'
        elif tempprobSpam >= tempprobHam:
            predictClass[file] = '0'

        tempprobSpam = copy.copy(probSpam)
        tempprobHam = copy.copy(probHam)
        kataDataUji.clear()

        if predictClass[file] == labelDict[file] and predictClass[file] == '0':
            matrixConfussion['TP'] += 1
        elif predictClass[file] == '1' and labelDict[file] == '0':
            matrixConfussion['FN'] += 1
        elif predictClass[file] == '0' and labelDict[file] == '1':
            matrixConfussion['FP'] += 1
        elif predictClass[file] == labelDict[file] and predictClass[file] == '1':
            matrixConfussion['TN'] += 1

    return (predictClass, matrixConfussion)
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as et
import re

import enchant
from enchant.checker import SpellChecker

d = enchant.Dict("de_CH")

filename = "/Users/Simon/UNI VII/bigdata/nzz/NZZ_1910_1920-with-uuid/1910-01/JM20121222000301997.xml"
# filename = "/Users/tabris/Downloads/NZZ_1910_1920-with-uuid/1910-08/JM20121222000281742.xml"

tree = et.parse(filename)
root = tree.getroot()
content = root.find('TX')

clist = []
craw = []
letter_ratio = []
lengths = []

# loop through the XML file's P tags
for child in content.findall('P'):
    craw.append(unicode(child.text))
    # try to put content into a string
    # NICO: use the unicode class, which offers pretty much all the functions of the
    # string class, but does not save them as 8-bit strings
    try:
        text = unicode(child.text)
    except:
        clist.append('shit_encoding')
        continue
import enchant
import string

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter, defaultdict

hidden_dim = 128

all_stopwords = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()
english_dict = enchant.Dict("en_US")


def remove_digits(s):
    return ''.join([i for i in s if not i.isdigit()])


def lemmatize(s):
    return wordnet_lemmatizer.lemmatize(
        s.lower().encode("ascii", "ignore").decode("utf-8"))


def clean(s):
    tmp_words = s.lower().split(" ")
    tmp_words = [
        word.translate(str.maketrans('', '', string.punctuation))
def setTrainingVars(self, P, corp, num_topics, NTest, NTrain, lapp="",
                    includeLabels=False):
    self.includeLabels = includeLabels
    self.T = NTest
    self.TRAIN = NTrain
    self.corpus = corp
    self.dfs = self.corpus.dfs()
    self.K = num_topics

    loc = lapp + "exports/" + P + "/lda_states/ldapy" + str(self.K)
    self.lda = models.ldamodel.LdaModel.load(loc)

    # Record the highest weight seen for each of the top-100 terms per topic.
    for z in range(0, self.K):
        topic = self.lda.state.get_lambda()[z]
        topic = topic / topic.sum()
        bestn = matutils.argsort(topic, 100, reverse=True)
        terms = [(id, topic[id]) for id in bestn]
        # terms = lda.get_topic_terms(z, 100)
        for term in terms:
            word = corp.dictionary[term[0]].lower()
            weight = term[1]
            occurences = self.dfs[term[0]]
            # idf = log(corpus.documentCount / (1 + occurences))
            if word in self.wordweights:
                if weight > self.wordweights[word]:
                    self.wordweights[word] = weight  # * idf
            else:
                self.wordweights[word] = weight  # * idf
    # print('\n\n')

    with open(lapp + "exports/" + P + "/good_ADJ.txt", "r") as f:
        for line in f:
            self.good_adjs.append(line.strip())
    with open(lapp + "exports/" + P + "/bad_ADJ.txt", "r") as f:
        for line in f:
            self.bad_adjs.append(line.strip())
    with open(lapp + "exports/" + P + "/good_NOUN.txt", "r") as f:
        for line in f:
            self.good_verbs.append(line.strip())
    with open(lapp + "exports/" + P + "/bad_NOUN.txt", "r") as f:
        for line in f:
            self.bad_verbs.append(line.strip())
    with open(lapp + "exports/" + P + "/featuresAprioriLexicalPruned.txt", "r") as f:
        for line in f:
            self.product_features.append(line.strip())
    with open(lapp + "inputs/badwords.txt", "r") as f:
        for line in f:
            self.bad_words.append(line.decode('utf-8').strip())

    self.currentGenerator = NTrain * 2
    self.nnn = NTrain * 2
    self.nlp = English()
    self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    self.d = enchant.Dict("en_US")
###########################################################################################
# Maps a given list of company names to their website domain names
# Add downweighting for companies with non-www starting
###########################################################################################
import urllib
import json as m_json
from urlparse import urlparse
import enchant
import testData
import sys

URL_COUNT_WEIGHT = .25
URL_ORDER_WEIGHT = -.25
URL_LEN_WEIGHT = -.1

ENGLISH_DICT = enchant.Dict("en_US")
TRIVIAL_WORDS = ["company", "inc", "group", "corporation", "co", "corp",
                 "university", "college", "&", "llc", "the", "of", "a", "an"]


# Code adapted from
# http://stackoverflow.com/questions/3898574/google-search-using-python-script
#
# Assume Q is a list of unique strings
def getURLForQuery(q, query2URLS):
    query = urllib.urlencode({'q': q})
    response = urllib.urlopen(
        'http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json = m_json.loads(response)
    results = json['responseData']['results']
    URLS = []
    for result in results:
        title = result['title']
        # was URL in the original and that threw a NameError exception
        url = result['url']
        URLS.append(url)
    query2URLS[q] = URLS
import sys

import enchant

print(80 * '-')
print('PYTHONPATH: %s' % sys.path)

# At least one backend should be available
backends = [x.name for x in enchant.Broker().describe()]
if len(backends) < 1:
    raise SystemExit('Error: No dictionary backend available')
print(80 * '-')
print('Backends: ' + ', '.join(backends))

# Usually the en_US dictionary should be bundled.
langs = enchant.list_languages()
dicts = [x[0] for x in enchant.list_dicts()]
if len(dicts) < 1:
    raise SystemExit('No dictionary available')
print(80 * '-')
print('Languages: %s' % ', '.join(langs))
print('Dictionaries: %s' % dicts)
print(80 * '-')

# Try spell checking if English is available
l = 'en_US'
if l in langs:
    d = enchant.Dict(l)
    print('d.check("hallo") %s' % d.check('hallo'))
    print('d.check("halllo") %s' % d.check('halllo'))
    print('d.suggest("halllo") %s' % d.suggest('halllo'))
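# A small extension sketch (mine, not part of the original script): if en_US is
# not installed, fall back to the first language the broker does provide so the
# spell-check smoke test still runs.
if l not in langs and langs:
    fallback = langs[0]
    d = enchant.Dict(fallback)
    print('en_US unavailable, falling back to %s' % fallback)
    print('d.check("test") %s' % d.check('test'))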
def default_dict(self, language):
    return enchant.Dict(language)