from alphabet_detector import AlphabetDetector  # assumed import; the examples below all rely on it


def check_alphabet(text, alphabet, only=True):
    # Parameter renamed from `str` to avoid shadowing the builtin.
    ad = AlphabetDetector()
    if only:
        # Every letter must belong to the alphabet (non-letters are ignored).
        return ad.only_alphabet_chars(text, alphabet.upper())
    # Otherwise a single matching character is enough.
    for ch in text:
        if ad.is_in_alphabet(ch, alphabet.upper()):
            return True
    return False
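# A minimal usage sketch of check_alphabet above. With only=True every letter
# must match the alphabet; with only=False one matching character suffices.
print(check_alphabet("hello", "latin"))               # True
print(check_alphabet("привет мир", "cyrillic"))       # True
print(check_alphabet("hi мир", "latin"))              # False: mixed alphabets
print(check_alphabet("hi мир", "latin", only=False))  # True: 'h' is Latin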
import string


def isArabic(s):
    ad = AlphabetDetector()
    # Python 3 equivalent of the old Python 2 str.translate(None, string.digits).
    string_without_numbers = str(s).translate(str.maketrans('', '', string.digits))
    if string_without_numbers == '':
        return False
    return ad.only_alphabet_chars(string_without_numbers, 'ARABIC')
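# Quick check: digits are stripped first, so a purely numeric string is not
# considered Arabic.
print(isArabic("مرحبا"))  # True
print(isArabic("hello"))  # False
print(isArabic("123"))    # False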
import tkinter as tk
from tkinter import messagebox


def validate_password(password):
    ad = AlphabetDetector()
    if len(password) <= 3:
        tk.messagebox.showerror('Information', 'Password too short (at least 4 symbols)')
        return False
    elif not ad.only_alphabet_chars(password, 'LATIN'):
        tk.messagebox.showerror('Information', 'Password must contain latin chars and/or numbers')
        return False
    return True
def check_alphabet(text, alphabet, only=True):
    # Variant of check_alphabet that also accepts raw UTF-8 bytes.
    # (The source used Python 2's unicode(str, "utf-8"); modernized here.)
    ad = AlphabetDetector()
    uni_string = text.decode("utf-8") if isinstance(text, bytes) else text
    if only:
        return ad.only_alphabet_chars(uni_string, alphabet.upper())
    for ch in uni_string:
        if ad.is_in_alphabet(ch, alphabet.upper()):
            return True
    return False
from nltk.stem.snowball import RussianStemmer


def cleanText(text):
    '''
    Checks for words with hidden Latin characters and repairs them (and vice versa).
    Assumes the text contains only Latin and Cyrillic characters.
    '''
    ad = AlphabetDetector()
    st = RussianStemmer()
    is_broken = False
    clean_text = []
    for word in text:
        if ad.only_alphabet_chars(word, 'CYRILLIC'):
            clean_text.append(word)
        elif ad.only_alphabet_chars(word, 'LATIN'):
            clean_text.append(word)
        else:
            # Mixed alphabets: repair the word by swapping look-alike letters.
            is_broken = True
            clean_text.append(letterSwap(word))
    clean_text = [st.stem(word) for word in clean_text]
    return clean_text, is_broken
import string

from hangul_romanize import Transliter
from hangul_romanize.rule import academic


def kor2en(title):
    # Parameter renamed from `str` to avoid shadowing the builtin.
    ad = AlphabetDetector()
    inputTitle = title
    outputTitle = ""
    # Treat all punctuation as invalid, except '.' (kept for the extension).
    invalidChars = set(string.punctuation.replace(".", ""))
    # Strip invalid chars. (The source indexed with range() and carried a
    # stray `i += 1` left over from a while loop; plain iteration is equivalent.)
    for ch in inputTitle:
        if ch not in invalidChars:
            outputTitle += ch
    # Transliterate only if non-Latin (i.e. Korean) characters remain.
    if not ad.only_alphabet_chars(outputTitle, "LATIN"):
        transliter = Transliter(academic)
        outputTitle = transliter.translit(outputTitle)
    return outputTitle
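# Hypothetical usage, assuming the hangul-romanize package is installed:
print(kor2en("report.txt"))  # 'report.txt' -- already Latin, passes through
print(kor2en("한글.txt"))     # transliterated, e.g. 'hangeul.txt'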
import sys
import tkinter as tk
from tkinter import messagebox


def validate_username(username):
    ad = AlphabetDetector()
    if not ad.only_alphabet_chars(username, 'LATIN'):
        tk.messagebox.showerror('Information', 'Username must contain latin chars and/or numbers')
        return False
    try:
        with open(sys.path[1] + '\\users\\users.txt', 'r') as credentials:
            for line in credentials:
                line = line.split(':')
                if line[0] == username:
                    tk.messagebox.showerror('Information', 'Username already exists')
                    return False
        return True
    except FileNotFoundError:
        print('users.txt file not found')
        return False
def check_txtcharsets(bot, event):
    # `bot`, `log` and `extract_values` come from the surrounding bot framework.
    place = "group"
    cid = event.data["chat"]["chatId"]
    mid = event.data["msgId"]
    from_uid = event.data["from"]["userId"]
    ad = AlphabetDetector()
    texts = extract_values(event.data, "text")
    log.debug('Authorized charsets in @[%s]: %s' % (cid, str(bot.parties.get_charsets(cid))))
    if bot.parties.get_charsets(cid) is not None and str(bot.parties.get_charsets(cid)) != "":
        for charset in list(str(bot.parties.get_charsets(cid)).split(" ")):
            if charset != "":
                for txt in texts:
                    log.debug('Testing charset %s on %s' % (charset, txt))
                    if not ad.only_alphabet_chars(txt, charset):
                        log.debug('text %s is not authorized' % txt)
                        return False
    return True
def letterSwap(word):
    '''
    Turns Latin look-alike letters in a word into Cyrillic ones;
    if the result is still not purely Cyrillic, swaps the other way instead.
    '''
    ad = AlphabetDetector()
    # Latin keys, Cyrillic values.
    latin_like_cyr = {
        'a': 'а', 'c': 'с', 'e': 'е', 'o': 'о', 'p': 'р', 'y': 'у',
        'A': 'А', 'B': 'В', 'C': 'С', 'E': 'Е', 'H': 'Н', 'K': 'К',
        'M': 'М', 'O': 'О', 'P': 'Р', 'T': 'Т', 'X': 'Х'
    }
    cyr_like_latin = {v: k for k, v in latin_like_cyr.items()}
    for char in latin_like_cyr:
        word = word.replace(char, latin_like_cyr[char])
    if ad.only_alphabet_chars(word, 'CYRILLIC'):
        return word
    # Fall back: swap every look-alike to Latin (this also undoes the pass above).
    for char in cyr_like_latin:
        word = word.replace(char, cyr_like_latin[char])
    return word
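# Quick sketch: a Cyrillic word typed with look-alike Latin letters is repaired.
print(letterSwap("cлoвo"))  # 'слово' -- Latin 'c' and 'o' replaced by Cyrillic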
import re
import string


def preprocess(df):
    ad = AlphabetDetector()
    exclude = set(string.punctuation)

    def process_special_characters(s):
        # Replace punctuation with spaces, then trim the left edge.
        l = []
        for ch in s:
            if ch not in exclude:
                l.append(ch)
            else:
                l.append(' ')
        y = ''.join(l)
        return y.lstrip()

    df['job_title'] = df['job_title'].apply(lambda x: x.lower())
    df['job_title'] = df['job_title'].apply(
        lambda x: ''.join([i for i in x if not i.isdigit()]))
    df['is_alphabet'] = df['job_title'].apply(
        lambda x: ad.only_alphabet_chars(str(x), "LATIN"))
    df = df[df['is_alphabet'] == True]
    df['job_title'] = df['job_title'].apply(
        lambda x: re.sub(' +', ' ', process_special_characters(x)))
    del df['is_alphabet']
    return df
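# Tiny illustration on a throwaway frame (pandas assumed): non-Latin titles are
# dropped, digits and punctuation are cleaned away. May emit a
# SettingWithCopyWarning since the function assigns into a filtered slice.
import pandas as pd

jobs = pd.DataFrame({"job_title": ["Data Scientist 2", "C++ Developer", "инженер"]})
print(preprocess(jobs)["job_title"].tolist())  # roughly ['data scientist ', 'c developer']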
def is_new_account_bot(status):
    # `is_egg`, `is_bot_name` and `twitter_time_to_object` are project helpers.
    ad = AlphabetDetector()
    susp_score = 0
    egg = is_egg(status)
    if "user" not in status:
        return False  # was a bare `return`; keep the boolean contract
    user = status["user"]
    sn = user["screen_name"]
    n = user["name"]
    bot_name = is_bot_name(sn)
    tweets = user["statuses_count"]
    friends = user["friends_count"]
    followers = user["followers_count"]
    created_at = user["created_at"]
    location = user["location"]
    time_obj = twitter_time_to_object(created_at)
    created_year = int(time_obj.strftime("%Y"))
    # Accumulate a suspicion score from account-shape heuristics.
    if egg:
        susp_score += 50
    if bot_name:
        susp_score += 100
    if created_year < 2017:
        susp_score -= 300
    if len(location) > 0:
        susp_score -= 150
    if len(sn) == 15:
        susp_score += 100
    if tweets == 0:
        susp_score += 50
    if tweets > 0:
        susp_score -= 50
    if tweets > 20:
        susp_score -= 100
    if friends == 21:
        susp_score += 100
    if friends == 0:
        susp_score += 50
    if friends != 21:
        susp_score -= 50
    if friends > 40:
        susp_score -= 100
    if friends > 100:
        susp_score -= 100
    if followers == 0:
        susp_score += 50
    if followers > 0:
        susp_score -= 200
    if len(n) < 3:
        susp_score += 100
    if ad.only_alphabet_chars(n, "CYRILLIC"):
        susp_score += 200
    if ad.only_alphabet_chars(n, "ARABIC"):
        susp_score += 200
    if ad.is_cjk(n):
        susp_score += 200
    if ad.only_alphabet_chars(n, "LATIN"):
        susp_score -= 100
    return susp_score > 0
import logging
import re

import numpy as np

# Relies on project helpers (Logger, load_data, tokenize_sentences_adv,
# read_embedding_list, clear_embedding_list, convert_tokens_to_ids) and
# constants (NAN_WORD, UNKNOWN_WORD, END_WORD) from the surrounding repo.


def main(*kargs, **kwargs):
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    logger_fname = kwargs['logger']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    train_clean = kwargs['train_clean']
    test_clean = kwargs['test_clean']
    embeds_clean = kwargs['embeds_clean']
    embeds_fname = kwargs['embeds']
    embeds_type = kwargs['embeds_type']
    oov_embeds_file = kwargs['oov_embeds']
    train_labels = 'data/train.labels.npy'

    # ==== Create logger ====
    logger = Logger(logging.getLogger(), logger_fname)

    # ==== Load data ====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)
    target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # ==== Tokenize comment texts ====
    logger.info('Replacing nans and tokenizing texts...')
    list_sentences_train = train_df['comment_text'].fillna(NAN_WORD).values
    list_sentences_test = test_df['comment_text'].fillna(NAN_WORD).values
    # train_tokens, word_dict = tokenize_sentences(list_sentences_train, {})
    # test_tokens, word_dict = tokenize_sentences(list_sentences_test, word_dict)
    train_tokens, word_dict = tokenize_sentences_adv(list_sentences_train, {})
    test_tokens, word_dict = tokenize_sentences_adv(list_sentences_test, word_dict)
    word_dict[UNKNOWN_WORD] = len(word_dict)

    # # ==== Load additional data ====
    # logger.info('Loading additional data...')
    # swear_words = load_data(swear_words_fname, func=lambda x: set(x.T[0]), header=None)
    # wrong_words_dict = load_data(wrong_words_fname, func=lambda x: {val[0]: val[1] for val in x})

    # ==== Load embedding vectors and clean them ====
    logger.info('Loading embeddings...')
    embedding_list, embedding_word_dict = read_embedding_list(embeds_fname)
    embedding_size = len(embedding_list[0])
    if oov_embeds_file != '':
        logger.info('Loading embeddings for oov words...')
        embedding_list, embedding_word_dict = read_embedding_list(oov_embeds_file,
                                                                  embedding_word_dict,
                                                                  embedding_list)
        embedding_size = len(embedding_list[0])
    logger.info('Cleaning embedding list...')
    embedding_list, embedding_word_dict, oov_words = clear_embedding_list(embedding_list,
                                                                          embedding_word_dict,
                                                                          word_dict)

    # ======== Clean oov words and save them =========
    oov_cleaned = []
    ad = AlphabetDetector()
    with open('data/oov_words_{0}.txt'.format(embeds_type), 'wt+') as oov_file:
        for w in oov_words:
            # Keep only short, purely Latin-script words.
            if ad.only_alphabet_chars(w, "LATIN") and re.match(r'^[A-Za-z]+$', w) and (len(w) <= 15):
                oov_cleaned.append(w)
                oov_file.write(w + '\n')
    # (the source's explicit oov_file.close() was redundant inside the `with` block)

    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append(np.asarray([0.] * embedding_size))
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append(np.asarray([-1.] * embedding_size))
    embedding_matrix = np.array(embedding_list)

    # ==== Convert word tokens into sequences of word ids ====
    logger.info('Converting tokens to word ids...')
    id_to_word = dict((id, word) for word, id in word_dict.items())
    train_token_ids = convert_tokens_to_ids(tokenized_sentences=train_tokens,
                                            words_list=id_to_word,
                                            embedding_word_dict=embedding_word_dict,
                                            sentences_length=500)
    test_token_ids = convert_tokens_to_ids(tokenized_sentences=test_tokens,
                                           words_list=id_to_word,
                                           embedding_word_dict=embedding_word_dict,
                                           sentences_length=500)

    # ==== Prepare train/test data for NN ====
    x = np.array(train_token_ids)
    y = np.array(train_df[target_labels].values)
    x_test = np.array(test_token_ids)

    # ==== Saving the results ====
    logger.info("Saving results...")
    np.save(train_clean, x)
    np.save(train_labels, y)
    np.save(test_clean, x_test)
    np.save(embeds_clean, embedding_matrix)
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

ad = AlphabetDetector()

with open("D:/python_arcticles/texts1.txt", 'r') as f:
    words = [line.rstrip('\n') for line in f]


def is_there_number(string):
    return any(i.isdigit() for i in string)


def is_not_blank(s):
    return bool(s and s.strip())


nltk.download('stopwords')

mywords = []
for word in words:
    if ad.only_alphabet_chars(u"{}".format(word), "CYRILLIC") and not is_there_number(word) \
            and is_not_blank(word) and word not in stopwords.words('russian'):
        mywords.append(word)
print(mywords)

fdist = FreqDist(mywords)
print(fdist.most_common(5))
import collections
import itertools
import pickle
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from pymystem3 import Mystem

# `stopword_lists` is a project-local module providing regex signatures and stop word lists.


class Preprocessing(object):
    def __init__(self):
        # Pre-loading objects
        self.re_signatures = [re.compile(each) for each in stopword_lists.signatures]
        self.mystem = Mystem()
        # self.morph = pymorphy2.MorphAnalyzer()
        self.mystem_lemma_dict = None
        self.ad = AlphabetDetector()
        # self.proj_path = '/'.join(inspect.getfile(inspect.currentframe()).split('/')[:-2])
        print('Preprocessing loaded')

        # Dicts
        # self.en_dict = enchant.DictWithPWL("en_US", self.proj_path + '/Preprocessing/Dicts/IT_EN_dict.txt')
        # self.ru_aot_dict = enchant.Dict("ru_RU")
        self.stop_words = set(stopword_lists.yandex_seo_stopwords +
                              stopword_lists.custom_stop_words +
                              stopwords.words('russian'))
        # NOTE: the trailing quote characters were mojibake in the source;
        # Russian guillemets «» are an assumption here.
        self.padding_punctuation = """!"#$%&\'()*+,;<=>?[\\]^`{|}~/«»"""
        self.full_punctuation = string.punctuation + '«»'

    # ======================================== #
    # ######## STRING PREPROCESSING ########## #
    # ======================================== #

    @staticmethod
    def normalize(input_string):
        return input_string.lower().strip().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')

    @staticmethod
    def cut_by_signature(input_string, signature_string):
        """ Cut by input search pattern (string) """
        p = re.compile(signature_string)
        search = p.search(input_string)
        try:
            start_index = search.span()[0]  # start index
            if start_index > 4:  # Do not cut from the beginning
                return input_string[:start_index]
            else:
                return input_string
        except AttributeError:
            return input_string

    def cut_by_signatures(self, input_string):
        """ Find index of earliest signature with precompiled regex and cut it """
        beginnings = []
        for each in self.re_signatures:
            try:
                # Index of 1st found position
                beginnings.append(each.search(input_string).span()[0])
            except AttributeError:
                pass
        if beginnings:
            cut = min(beginnings)
            # Not in the beginning
            if cut > 5:
                return input_string[:cut]
            else:
                return input_string
        else:
            return input_string

    def pad_punctuation(self, input_string, punct_list=None):
        """ Used to control tokenization """
        normal_text = input_string.strip()
        padding_punctuation = punct_list if punct_list else self.padding_punctuation
        for char in padding_punctuation:
            normal_text = normal_text.replace(char, ' ' + char + ' ')
        return normal_text

    @staticmethod
    def tokenize(input_string):
        return nltk.word_tokenize(input_string)

    def get_vocab(self, series):
        return set(self.series_to_chain(series))

    def get_all_token_chain(self, series):
        return self.series_to_chain(series)

    def is_punct(self, token):
        """ True only if all chars are punct """
        for c in token:
            if c not in self.full_punctuation:
                return False
        return True

    def remove_punct(self, tokenlist):
        return [token for token in tokenlist if not self.is_punct(token)]

    @staticmethod
    def contains_digits(input_string):
        return any(char.isdigit() for char in input_string)

    def contains_punct(self, input_string):
        return any(self.is_punct(char) for char in input_string)

    def is_cyrillic(self, token):
        """ Checks if string has only cyrillic letters """
        # return not(any(ord(c) < 128 for c in token))
        if self.contains_digits(token) or self.contains_punct(token):
            return False
        else:
            return self.ad.only_alphabet_chars(token, 'CYRILLIC')

    def remove_stopwords(self, tokenized_text, stopword_list=None):
        if not stopword_list:
            stopword_list = self.stop_words
        return [t for t in tokenized_text if t not in stopword_list]

    @staticmethod
    def remove_by_token_length(tokenized_text, min_len=1, max_len=25):
        return [t for t in tokenized_text if len(t) >= min_len and len(t) < max_len]

    # ======================================== #
    # ########### POS/LEMMATIZING ############ #
    # ======================================== #

    '''
    def get_pymorphy_lemma(self, token):
        return self.morph.parse(token)[0].normal_form
    '''

    def get_mystem_lemma(self, token):
        # Returns the lemma for token; falls back to the token itself.
        response = self.mystem.analyze(token)
        analysis = response[0].get('analysis')
        try:
            the_one = analysis[0]
            lex = the_one.get('lex')
            return lex
        except (TypeError, IndexError):
            return token

    def get_mystem_pos_tags(self, token):
        response = self.mystem.analyze(token)
        analysis = response[0].get('analysis')
        try:
            the_one = analysis[0]
            tag = the_one.get('gr')
            return tag
        except (TypeError, IndexError):
            return None

    def lemmatize_series(self, series):
        if not self.mystem_lemma_dict:
            print('Building lemma-dictionary')
            vocab = self.get_vocab(series)
            self.mystem_lemma_dict = {token: self.get_mystem_lemma(token) for token in vocab}
        return series.apply(
            lambda tokenlist: [self.mystem_lemma_dict[token] for token in tokenlist])

    def get_nltk_pos_df(self, series):
        all_tokens = self.get_all_token_chain(series)
        nltk_tags_tuple = nltk.pos_tag(all_tokens, lang='rus')
        tags = set([each[1] for each in nltk_tags_tuple])

        def get_tokens_by_tag(tag):
            # Set of tokens by input tag
            token_tag_list = list(filter(lambda x: x[1] == tag, nltk_tags_tuple))  # [token, tag]
            return [each[0] for each in token_tag_list]  # [token]

        tag_dict = collections.OrderedDict(
            zip(tags, [get_tokens_by_tag(tag) for tag in tags]))
        return pd.DataFrame.from_dict(tag_dict, orient='index').transpose()

    def get_mystem_pos_df(self, series):
        all_tokens = self.get_all_token_chain(series)
        mystem_tags_dict = {token: self.get_mystem_pos_tags(token) for token in set(all_tokens)}
        # filter_dict(mystem_tags_dict)
        mystem_tags_dict = dict(
            filter(lambda item: item[1] is not None, mystem_tags_dict.items()))

        def get_tokens_by_mystem_tag(input_tag):
            matched_tokens = [(token, all_tokens.count(token))
                              for token, tags in mystem_tags_dict.items()
                              if input_tag in tags]
            return sorted(matched_tokens, key=lambda x: x[1], reverse=True)

        # {tag: (token, count), ...}
        mystem_tag_dict = collections.OrderedDict(
            zip(stopword_lists.forbidden_mystem_tags,
                [get_tokens_by_mystem_tag(tag) for tag in stopword_lists.forbidden_mystem_tags]))
        return pd.DataFrame.from_dict(mystem_tag_dict, orient='index').transpose()

    # ======================================== #
    # ########## Jupyter analysis ############ #
    # ======================================== #

    @staticmethod
    def stats_for_untokenized(series):
        """ Counts symbols in series of texts """
        return sum([len(each) for each in series])

    @staticmethod
    def series_to_chain(series):
        """ Chained tokens in Series """
        return list(itertools.chain.from_iterable(list(series.values)))

    def stats_for_series(self, series):
        """ DF from Series stats """
        empty_texts_indexes = list(series[series.astype(str) == '[]'].index)
        empty_texts = len(empty_texts_indexes)
        token_chain = self.series_to_chain(series)
        data = pd.DataFrame(
            data=[[len(token_chain), len(list(set(token_chain))), len(series),
                   empty_texts, token_chain.count('')]],
            index=['Count'],
            columns=['Total tokens', 'Unique tokens', 'Total texts',
                     'Empty texts', 'Empty tokens'])
        return data

    @staticmethod
    def check_empty_texts(series, original_df=None):
        """ Get unprocessed text for '[]' in Series """
        empty_texts_indexes = list(series[series.astype(str) == '[]'].index)
        if original_df is not None:
            return original_df.loc[empty_texts_indexes]
        else:
            return empty_texts_indexes

    @staticmethod
    def drop_empty_text_rows(series):
        drop_indexes = series[series.astype(str) == '[]'].index
        return series.drop(drop_indexes)

    @staticmethod
    def plot_occurrences(series, str_expression):
        """
        Detects first occurrence of str expression in text.
        Plots index distribution of occurrences.
        """
        indexes = [text.index(str_expression) for text in series if str_expression in text]
        fig, ax = plt.subplots()
        ax.hist(indexes, range(0, 50))
        ax.set_xticks(np.arange(0, 51, 1))
        ax.set_xlabel('Position')
        ax.set_ylabel('Count')
        plt.title("Occurrence distribution")
        print(len(indexes), ' occurrences found')
        return ax

    def get_token_frequencies_df(self, series, topn=50):
        ctr = collections.Counter(self.series_to_chain(series))
        fdist_list = ctr.most_common(topn)
        tokens = [k for k, v in fdist_list]
        counts = [v for k, v in fdist_list]
        return pd.DataFrame({"token": tokens, "count": counts})

    def plot_token_frequencies(self, series, top_n=30):
        """ Plot frequency distribution over corpus for top_n tokens """
        ctr = collections.Counter(list(self.series_to_chain(series)))
        fdist_list = ctr.most_common(top_n)
        tokens = [k for k, v in fdist_list]
        counts = [v for k, v in fdist_list]
        token_count = pd.DataFrame({"token": tokens, "count": counts})
        sns.barplot(x="count", y="token", data=token_count).set_xlabel('Token appearance')

    def plot_token_distribution(self, series):
        """ Overall token length distribution for series """
        token_lengths = [len(x) for x in self.series_to_chain(series)]
        bow_lengths = [len(x) for x in series]  # tokens per document
        fig, ax = plt.subplots(ncols=2)
        ax[0].hist(token_lengths, bins=range(0, 25))
        ax[0].set_xticks(np.arange(0, 26, 1))
        ax[0].set_xlabel('Token length')
        ax[0].set_ylabel('Count')
        ax[1].hist(bow_lengths, bins=range(0, 25))
        ax[1].set_xticks(np.arange(0, 26, 1))
        ax[1].set_xlabel('Tokens in docs')
        ax[1].set_ylabel('Count')
        return ax

    @staticmethod
    def most_common_in_df(df):
        result = dict()
        for col in df.columns:
            try:
                col_most_freq = df[col].value_counts().reset_index()
                tokens = col_most_freq['index']
                freqs = col_most_freq[col]
                result[col] = [(t, f) for t, f in zip(tokens, freqs)]
            except Exception:
                result[col] = [None]
        return pd.DataFrame.from_dict(result, orient='index').transpose()

    # ======================================== #
    # ###### TOKEN SEQUENCE PROCESSING ####### #
    # ======================================== #

    @staticmethod
    def get_texts_with_token(series, token):
        return [text for text in series if token in text]

    @staticmethod
    def cut_after_token(tokenlist, token, pos=0):
        """ Truncate token list after input token position """
        if token in tokenlist:
            if tokenlist.index(token) > 1:
                return tokenlist[:tokenlist.index(token) + pos]
            else:
                return tokenlist
        else:
            return tokenlist

    @staticmethod
    def get_indexes_of_token(series, token):
        """ Indexes of the token in all documents """
        indexes = [text.index(token) for text in series if token in text]
        return indexes

    @staticmethod
    def token_scope(series, token, pos):
        """ Set of tokens going before or after (by position) the given token """
        found = series.apply(lambda x: x[x.index(token) + pos] if token in x else 0)
        token_set = list(set(found[found != 0]))
        return token_set

    @staticmethod
    def seq_in_series(series, seq):
        """ Return text if sequence is in token list """
        result = []
        for text in series:
            if seq[0] in text:
                index = text.index(seq[0])
                if seq == text[index:(index + len(seq))]:
                    result.append(text)
        return result

    def plot_indexes_of_token(self, series, token, x_range):
        indexes = self.get_indexes_of_token(series, token)
        fig, ax = plt.subplots()
        ax.hist(indexes, bins=range(0, x_range))
        ax.set_xticks(np.arange(0, x_range + 1, 1))
        ax.set_yticks(np.arange(0, 21, 1))
        ax.set_xlabel('Index')
        ax.set_ylabel('Count')
        plt.title(token)
        return ax

    @staticmethod
    def cut_after_seq(tokenlist, seq):
        """ Truncate document after token sequence """
        if seq[0] in tokenlist:  # if first element of seq is in text
            index = tokenlist.index(seq[0])
            if seq == tokenlist[index:(index + len(seq))]:  # if the whole sequence matches
                return tokenlist[:tokenlist.index(seq[0])]
            else:
                return tokenlist
        else:
            return tokenlist

    @staticmethod
    def cut_seq(tokenlist, seq):
        """ Removes sequence from tokenized texts. """
        if seq[0] in tokenlist:
            index = tokenlist.index(seq[0])
            if seq == tokenlist[index:(index + len(seq))]:
                return tokenlist[:index] + tokenlist[index + len(seq):]  # TODO: test it
            else:
                return tokenlist
        else:
            return tokenlist

    # ======================================== #
    # ################ OTHER ################# #
    # ======================================== #

    def separate_by_category(self, series):
        """
        Separates tokens by types of chars in them (punctuation, numbers, ...)
        :param series: series of tokenized texts
        :return: dict of {category: [tokenlist]}
        """
        vocab = self.series_to_chain(series)
        result = {
            'num_punct': [], 'alpha_num': [], 'alpha_punct': [],
            'punct_tokens': [], 'numeric_tokens': [], 'alpha_tokens': [],
            'alpha_num_punct': []
        }
        for token in vocab:
            # Add flag by symbol category
            punct = [1 for symbol in token if (symbol in self.full_punctuation)]
            numerics = [1 for symbol in token if (symbol.isnumeric())]
            alpha = [1 for symbol in token if (symbol.isalpha())]
            # If token contains all types
            if (punct and numerics) and alpha:
                result['alpha_num_punct'].append(token)
            # Double
            elif numerics and punct:
                result['num_punct'].append(token)
            elif numerics and alpha:
                result['alpha_num'].append(token)
            elif alpha and punct:
                result['alpha_punct'].append(token)
            # Simple
            elif punct:
                result['punct_tokens'].append(token)
            elif numerics:
                result['numeric_tokens'].append(token)
            elif alpha:
                result['alpha_tokens'].append(token)
        return result

    def get_categories_df(self, series):
        """
        Separates tokens by types of chars in them (punctuation, numbers, ...)
        into different categories and sorts them by frequency
        """
        separated_categories_dict = self.separate_by_category(series)
        categories = pd.DataFrame.from_dict(separated_categories_dict, orient='index')
        return categories.transpose()

    # ======================================== #
    # ############## PIPELINES ############### #
    # ======================================== #

    def apply_pipeline(self, raw_string):
        """ Apply all the methods to raw string """
        normalized = self.normalize(raw_string)
        # print('normalized: ', normalized)
        signatures_cut = self.cut_by_signatures(normalized)
        # print('signatures_cut: ', signatures_cut)
        padded = self.pad_punctuation(signatures_cut)
        # print('padded: ', padded)
        tokenized = self.tokenize(padded)
        # print('tokenized: ', tokenized)
        no_punct = self.remove_punct(tokenized)
        # print('no_punct: ', no_punct)
        no_stops = self.remove_stopwords(no_punct)
        cut_by_len = [t for t in no_stops if len(t) < 25]
        lemmatized = [self.get_mystem_lemma(token) for token in cut_by_len]
        # print('lemmatized: ', lemmatized)
        return lemmatized

    def apply_short_pipeline(self, raw_string):
        """ Preprocessing for manual input in window form on client-side """
        normalized = self.normalize(raw_string)
        tokenized = self.tokenize(normalized)
        cut_by_len = [t for t in tokenized if len(t) < 25]
        lemmatized = [self.get_mystem_lemma(token) for token in cut_by_len]
        return lemmatized

    @staticmethod
    def pickle_save(data, path):
        with open(path, 'wb') as fp:
            print(type(data))
            pickle.dump(data, fp)
            print('Saved as ', path)
import os


def process(thread, number, out_dir):
    # `thread` is a submission followed by its comments; params renamed from
    # `list`/`dir` to avoid shadowing builtins. `fixLine` and `findNext` are
    # project helpers.
    ad = AlphabetDetector()
    nullreturn = (0, [])
    post = thread[0]
    comments = thread[1:]
    count = 0
    data = []
    if not ad.only_alphabet_chars(post["title"], "LATIN"):
        return nullreturn
    if len(comments) < 2:
        return nullreturn
    commentids = [comment["id"] for comment in comments]
    level1 = []
    level1ids = []
    level2 = []
    notlevel1 = []
    notlevel2 = []
    # Top-level comments reply to the post itself, not to another comment.
    for comment in comments:
        if comment["parent_id"][3:] not in commentids:
            level1.append(comment)
            level1ids.append(comment["id"])
        else:
            notlevel1.append(comment)
    for comment in notlevel1:
        if comment["parent_id"][3:] not in level1ids:
            notlevel2.append(comment)
        else:
            level2.append(comment)
    if len(level2) < 1:
        return nullreturn
    for comment in level2:
        for parent in level1:
            if comment["parent_id"][3:] == parent["id"]:
                break
        if comment["parent_id"][3:] == parent["id"] and comment["body"] != "[deleted]":
            fname = os.path.join(out_dir, "reddit" + "{:0>4d}".format(number + count) + ".txt")
            print("Creating file: reddit" + "{:0>4d}".format(number + count) + ".txt")
            with open(fname, "w") as file:
                file.write(post["title"].replace('\n', ' ').replace('\r', ' ') + "\n")
                file.write(post["url"] + "\n")
                file.write(parent["author"] + ": " + fixLine(parent["body"]))
                file.write(comment["author"] + ": " + fixLine(comment["body"]))
            numcomments = findNext(fname, parent["author"], fixLine(parent["body"]),
                                   comment, notlevel2) + 2
            data.append([post["title"].replace(",", ""),
                         parent["author"],
                         comment["author"],
                         "https://www.reddit.com" + post["permalink"],
                         numcomments])
            count += 1
    return (count, data)
import re
import string

from nltk.tokenize import sent_tokenize


def process_text(text):
    '''Clean a text in order to be used in a language model.

    Args:
        text: A string containing the text.

    Returns:
        out_clean: A list of strings, one per clean sentence.
    '''
    out = ""
    # If a line starts with these, remove it ('Forwarded message' in English and Greek).
    words_to_stop = ['---------- Forwarded message ---------',
                     '---------- Προωθημένο μήνυμα ----------']
    # Checks if a word is Greek.
    ad = AlphabetDetector()
    # Regex that matches lines that contain the date of the message.
    date = re.compile('.*-.*-.*:.*')
    lines = text.split('\n')
    # Remove useless lines.
    for i in range(len(lines)):
        # If line is in the form yyyy-mm-dd hh:mm, remove it.
        if date.match(lines[i]) is not None:
            continue
        elif any(w in lines[i] for w in words_to_stop):
            break
        # Lines with '--' are the signature and lines with '>'
        # represent previous conversations.
        elif lines[i].startswith('--') or lines[i].startswith('>'):
            break
        # 'Στις ... έγραψε:' is the Greek 'On ... wrote:' quote header.
        elif lines[i].startswith('Στις') and lines[i].strip().endswith('έγραψε:'):
            break
        elif lines[i].startswith('On') and lines[i].strip().endswith('wrote:'):
            break
        elif i < len(lines) - 1 and lines[i].startswith('Στις') \
                and lines[i + 1].strip().endswith('έγραψε:'):
            break
        # Remove non-Greek words.
        else:
            for word in lines[i].split(' '):
                if ad.only_alphabet_chars(word, "GREEK"):
                    out += word + ' '
                # Keep dot after non-Greek word.
                elif word.strip().endswith('.'):
                    out += '. '
            out += '\n'
    # Break line in sentences.
    out = out.replace('\r', '')
    # Set salutation as a separate sentence.
    lines = out.split('\n')
    if lines[0].strip('\n').strip().endswith(',') and (len(lines[1].strip('\n').strip()) == 0
                                                       or lines[1].isupper()):
        lines[0] = lines[0].strip('\n').strip()[:-1] + '.'
    out = '\n'.join(lines)
    sentences = sent_tokenize(out)
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    sentences = [sent.translate(table) for sent in sentences]
    out_clean = []
    for sent in sentences:
        # Split into tokens by white space.
        tokens = sent.split()
        # Remove remaining tokens that are not alphabetic or numeric.
        toks = []
        for token in tokens:
            if token.isdigit():
                # Convert numeric tokens in Greek text.
                toks.append(converter(token))
            elif token.isalpha():
                toks.append(token)
        # Make lower case.
        toks = [word.lower() for word in toks]
        if toks:
            out_clean.append(' '.join(toks))
    return out_clean
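# A small sketch of process_text on a fake Greek email (requires nltk's 'punkt'
# tokenizer; `converter` is an external helper, only hit when digits survive).
# Non-Greek words like 'test' and the signature are stripped.
email = "Γεια σου Μαρία,\n\nΑυτό είναι ένα test μήνυμα.\n--\nSignature"
print(process_text(email))  # roughly: ['γεια σου μαρία', 'αυτό είναι ένα μήνυμα']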
import numpy as np
import pandas as pd
from geopy.exc import GeocoderServiceError, GeocoderTimedOut
from geopy.geocoders import Nominatim
from time import sleep


def assignLocation(df, torun, lat_label, lon_label, p=True):
    if p:
        print("\t\t\tIn Dictionaries: assignLocation...")
    ad = AlphabetDetector()
    timedOut = []
    done = True
    for row in torun:
        # Only fill in missing cities.
        if pd.isnull(df.loc[row, "Province"]):
            # Don't run with missing lat/lons.
            if not any([pd.isnull(df.loc[row, lat_label]),
                        pd.isnull(df.loc[row, lon_label])]):
                try:
                    geolocator = Nominatim(timeout=10, user_agent="Single_Batch_Run")
                    if p:
                        print("\t\t\t\tCities {}% complete...".format(int(row / len(df) * 100)))
                    location = geolocator.reverse("{}, {}".format(df.loc[row, lat_label],
                                                                  df.loc[row, lon_label]))
                    if "address" in location.raw:
                        dictionary = location.raw['address']
                        # Can include Region, Province and Country if desired.
                        # NOTE: the source used bitwise `~pd.isnull(...)`, which is
                        # always truthy on a plain bool; `not` is the intended test.
                        if not pd.isnull(df.loc[row, "Region"]):
                            if "hamlet" in dictionary:
                                if ad.only_alphabet_chars(dictionary["hamlet"], "LATIN"):
                                    df.loc[row, "Region"] = dictionary["hamlet"]
                            elif "state_district" in dictionary:
                                if ad.only_alphabet_chars(dictionary["state_district"], "LATIN"):
                                    df.loc[row, "Region"] = dictionary["state_district"]
                            elif "county" in dictionary:
                                if ad.only_alphabet_chars(dictionary["county"], "LATIN"):
                                    df.loc[row, "Region"] = dictionary["county"]
                            else:
                                df.loc[row, "Region"] = np.NaN
                        # The source tested `"Province" in dictionary` but then read
                        # dictionary["state"]; checking "state" (and "country" below)
                        # avoids a KeyError on Nominatim's lowercase keys.
                        if "state" in dictionary:
                            if ad.only_alphabet_chars(dictionary["state"], "LATIN"):
                                df.loc[row, "Province"] = dictionary["state"]
                        if "country" in dictionary:
                            if ad.only_alphabet_chars(dictionary["country"], "LATIN"):
                                df.loc[row, "Country"] = dictionary["country"]
                        if not pd.isnull(df.loc[row, "City"]):
                            if "city" in dictionary:
                                if ad.only_alphabet_chars(dictionary["city"], "LATIN"):
                                    df.loc[row, "City"] = dictionary["city"]
                            elif "town" in dictionary:
                                if ad.only_alphabet_chars(dictionary["town"], "LATIN"):
                                    df.loc[row, "City"] = dictionary["town"]
                            elif "village" in dictionary:
                                if ad.only_alphabet_chars(dictionary["village"], "LATIN"):
                                    df.loc[row, "City"] = dictionary["village"]
                            else:
                                df.loc[row, "City"] = np.NaN
                    sleep(0.5)  # in seconds
                except GeocoderTimedOut:
                    if p:
                        print("\t\tTimed Out")
                    done = False
                    timedOut.append(row)
                except GeocoderServiceError:
                    if p:
                        print("GeocoderServiceError! (Probably certificate)")
                    done = False
                    timedOut.append(row)
    return df, done, timedOut