Example #1
def check_alphabet(str, alphabet, only=True):
    ad = AlphabetDetector()
    if only:
        return ad.only_alphabet_chars(str, alphabet.upper())
    else:
        for i in str:
            if ad.is_in_alphabet(i, alphabet.upper()): return True
        return False
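A minimal usage sketch for this helper, assuming the alphabet_detector package is installed (the sample strings below are illustrative, not part of the original example):

from alphabet_detector import AlphabetDetector

# "only" mode: every letter must belong to the given alphabet
print(check_alphabet(u"hello", "latin"))                   # True
print(check_alphabet(u"hello мир", "latin"))               # False
# "any" mode: at least one character belongs to the alphabet
print(check_alphabet(u"hello мир", "latin", only=False))   # True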
Example #2
def isArabic(s):
    ad = AlphabetDetector()
    string_without_numbers = str(s).translate(None, string.digits)
    if string_without_numbers == '':
        return False
    else:
        return ad.only_alphabet_chars(unicode(string_without_numbers),
                                      'ARABIC')
Example #3
def validate_password(password):
    ad = AlphabetDetector()
    if len(password) <= 3:
        tk.messagebox.showerror('Information', 'Password too short (at least 4 symbols)')
        return False
    elif not ad.only_alphabet_chars(password, 'LATIN'):
        tk.messagebox.showerror('Information', 'Password must contain latin chars and/or numbers')
        return False
    return True
Example #4
def check_alphabet(str, alphabet, only=True):
    ad = AlphabetDetector()
    uni_string = unicode(str, "utf-8")
    if only:
        return ad.only_alphabet_chars(uni_string, alphabet.upper())
    else:
        for i in uni_string:
            if ad.is_in_alphabet(i, alphabet.upper()): return True
        return False
Example #5
def cleanText(text):
    '''
    Checks and repairs words containing hidden Latin characters (and vice versa).
    Assumes the text contains only Latin and Cyrillic characters.
    '''

    ad = AlphabetDetector()
    st = RussianStemmer()
    is_broken = False

    clean_text = []

    for word in text:
        if ad.only_alphabet_chars(word, 'CYRILLIC'):
            clean_text.append(word)
        elif ad.only_alphabet_chars(word, 'LATIN'):
            clean_text.append(word)
        else:
            is_broken = True
            clean_text.append(letterSwap(word))

    clean_text = [st.stem(word) for word in clean_text]
    return clean_text, is_broken
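A hedged illustration of cleanText working together with letterSwap from Example #10, assuming alphabet_detector is installed and that RussianStemmer comes from nltk.stem.snowball (the sample tokens are made up):

from alphabet_detector import AlphabetDetector
from nltk.stem.snowball import RussianStemmer

# "cервер" is typed with a Latin "c", so no single alphabet matches it
tokens = ["привет", "hello", "cервер"]
clean, was_broken = cleanText(tokens)
print(clean)       # stemmed tokens, with the mixed-alphabet word repaired
print(was_broken)  # True: at least one word had to be repaired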
Example #7
def kor2en(str):
    ad = AlphabetDetector()
    inputTitle = str
    outputTitle = ""
    # set invalid chars except . for extension
    invalidChars = set(string.punctuation.replace(".", ""))
    # replace invalid chars
    for ch in inputTitle:
        if ch not in invalidChars:
            outputTitle += ch
    if not ad.only_alphabet_chars(outputTitle,"LATIN"):
        transliter = Transliter(academic)
        outputTitle = transliter.translit(outputTitle)
    return outputTitle
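Transliter and academic here appear to come from the hangul-romanize package; a short usage sketch under that assumption:

import string
from alphabet_detector import AlphabetDetector
from hangul_romanize import Transliter        # assumed source of Transliter
from hangul_romanize.rule import academic     # assumed source of academic

print(kor2en("report!.txt"))   # punctuation other than "." is stripped: "report.txt"
print(kor2en("보고서.txt"))     # non-Latin title is transliterated to Latin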
Example #8
def validate_username(username):
    ad = AlphabetDetector()
    if not ad.only_alphabet_chars(username, 'LATIN'):
        tk.messagebox.showerror('Information', 'Username must contain latin chars and/or numbers')
        return False
    try:
        with open(sys.path[1] + '\\users\\users.txt', 'r') as credentials:
            for line in credentials:
                line = line.split(':')
                if line[0] == username:
                    tk.messagebox.showerror('Information', 'Username already exists')
                    return False
        return True
    except FileNotFoundError:
        print('users.txt file not found')
        return False
Example #9
def check_txtcharsets(bot, event):
    place = "group"
    cid = event.data["chat"]["chatId"]
    mid = event.data["msgId"]
    from_uid = event.data["from"]["userId"]

    ad = AlphabetDetector()
    texts = extract_values(event.data, "text")

    log.debug('Authorized charsets in @[%s]: %s' %
              (cid, str(bot.parties.get_charsets(cid))))

    if bot.parties.get_charsets(cid) is not None and str(
            bot.parties.get_charsets(cid)) != "":
        for charset in list(str(bot.parties.get_charsets(cid)).split(" ")):
            if charset != "":
                for txt in texts:
                    log.debug('Testing charset %s on %s' % (charset, txt))
                    if not ad.only_alphabet_chars(txt, charset):
                        log.debug('text %s is not authorized' % txt)
                        return False
    return True
Example #10
def letterSwap(word):
    '''
    Turns Latin look-alike letters in a word into Cyrillic ones; swaps them back if the result is not fully Cyrillic.
    '''

    ad = AlphabetDetector()
    # latin keys cyr values
    latin_like_cyr = {
        'a': 'а',
        'c': 'с',
        'e': 'е',
        'o': 'о',
        'p': 'р',
        'y': 'у',
        'A': 'А',
        'B': 'В',
        'C': 'С',
        'E': 'Е',
        'H': 'Н',
        'K': 'К',
        'M': 'М',
        'O': 'О',
        'P': 'Р',
        'T': 'Т',
        'X': 'Х'
    }

    cyr_like_latin = {v: k for k, v in latin_like_cyr.items()}

    for char in latin_like_cyr.keys():
        word = word.replace(char, latin_like_cyr[char])

    if ad.only_alphabet_chars(word, 'CYRILLIC'):
        return word
    else:
        for char in cyr_like_latin:
            word = word.replace(char, cyr_like_latin[char])
        return word
Example #11
def letterSwap(word):
    '''
    Turns Latin look-alike letters in a word into Cyrillic ones; swaps them back if the result is not fully Cyrillic.
    '''

    ad = AlphabetDetector()
    # latin keys cyr values
    latin_like_cyr = {'a': 'а', 'c': 'с', 'e': 'е', 'o': 'о', 'p': 'р',
                      'y': 'у', 'A': 'А', 'B': 'В', 'C': 'С', 'E': 'Е',
                      'H': 'Н', 'K': 'К', 'M': 'М', 'O': 'О', 'P': 'Р',
                      'T': 'Т', 'X': 'Х'}

    cyr_like_latin = {v: k for k, v in latin_like_cyr.items()}

    for char in latin_like_cyr.keys():
        word = word.replace(char, latin_like_cyr[char])

    if ad.only_alphabet_chars(word, 'CYRILLIC'):
        return word
    else:
        for char in cyr_like_latin:
            word = word.replace(char, cyr_like_latin[char])
        return word
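A short usage sketch, assuming alphabet_detector is installed (the sample words are illustrative):

from alphabet_detector import AlphabetDetector

# A Cyrillic word typed with the Latin look-alikes "c" and "o"
print(letterSwap("cлoвo"))    # -> "слово": the word becomes fully Cyrillic
# A genuinely Latin word: the swap fails ("l" and "n" stay Latin), so it is undone
print(letterSwap("clean"))    # -> "clean"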
Example #12
File: run.py  Project: bzerroug/Projects
def preprocess(df):
    ad = AlphabetDetector()
    exclude = set(string.punctuation)

    def process_special_caracters(s):
        l = []
        for ch in s:
            if ch not in exclude:
                l.append(ch)
            else:
                l.append(' ')
        y = ''.join(l)
        return y.lstrip()

    df['job_title'] = df['job_title'].apply(lambda x: x.lower())
    df['job_title'] = df['job_title'].apply(
        lambda x: ''.join([i for i in x if not i.isdigit()]))
    df['is_alphabet'] = df['job_title'].apply(
        lambda x: ad.only_alphabet_chars(str(x), "LATIN"))
    df = df[df['is_alphabet'] == True]
    df['job_title'] = df['job_title'].apply(
        lambda x: re.sub(' +', ' ', process_special_caracters(x)))
    del df['is_alphabet']
    return df
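A hedged sketch of calling preprocess on a toy DataFrame, assuming pandas, re, string and alphabet_detector are available (the sample job titles are made up):

import re
import string
import pandas as pd
from alphabet_detector import AlphabetDetector

df = pd.DataFrame({'job_title': ['Data Scientist 2', 'Ingénieur R&D', 'Продавец']})
clean = preprocess(df)
# Non-Latin rows are dropped; digits and punctuation are stripped from the rest
print(clean['job_title'].tolist())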
Example #13
def is_new_account_bot(status):
    ad = AlphabetDetector()
    susp_score = 0
    egg = is_egg(status)
    if "user" not in status:
        return False
    user = status["user"]
    sn = user["screen_name"]
    n = user["name"]
    bot_name = is_bot_name(sn)
    tweets = user["statuses_count"]
    friends = user["friends_count"]
    followers = user["followers_count"]
    created_at = user["created_at"]
    location = user["location"]
    time_obj = twitter_time_to_object(created_at)
    created_year = int(time_obj.strftime("%Y"))
    if egg == True:
        susp_score += 50
    if bot_name == True:
        susp_score += 100
    if created_year < 2017:
        susp_score -= 300
    if len(location) > 0:
        susp_score -= 150
    if len(sn) == 15:
        susp_score += 100
    if tweets == 0:
        susp_score += 50
    if tweets > 0:
        susp_score -= 50
    if tweets > 20:
        susp_score -= 100
    if friends == 21:
        susp_score += 100
    if friends == 0:
        susp_score += 50
    if friends != 21:
        susp_score -= 50
    if friends > 40:
        susp_score -= 100
    if friends > 100:
        susp_score -= 100
    if followers == 0:
        susp_score += 50
    if followers > 0:
        susp_score -= 200
    if len(n) < 3:
        susp_score += 100
    if ad.only_alphabet_chars(n, "CYRILLIC"):
        susp_score += 200
    if ad.only_alphabet_chars(n, "ARABIC"):
        susp_score += 200
    if ad.is_cjk(n):
        susp_score += 200
    if ad.only_alphabet_chars(n, "LATIN"):
        susp_score -= 100
    if susp_score > 0:
        return True
    else:
        return False
Example #14
def main(*kargs, **kwargs):
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    logger_fname = kwargs['logger']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    train_clean = kwargs['train_clean']
    test_clean = kwargs['test_clean']
    embeds_clean = kwargs['embeds_clean']
    embeds_fname = kwargs['embeds']
    embeds_type = kwargs['embeds_type']
    oov_embeds_file = kwargs['oov_embeds']
    train_labels = 'data/train.labels.npy'


    # ==== Create logger ====
    logger = Logger(logging.getLogger(), logger_fname)

    # ==== Load data ====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # ==== Tokenize comment texts ====
    logger.info('Replacing nans and tokenizing texts...')
    list_sentences_train = train_df['comment_text'].fillna(NAN_WORD).values
    list_sentences_test = test_df['comment_text'].fillna(NAN_WORD).values

    # train_tokens, word_dict = tokenize_sentences(list_sentences_train, {})
    # test_tokens, word_dict = tokenize_sentences(list_sentences_test, word_dict)

    train_tokens, word_dict = tokenize_sentences_adv(list_sentences_train, {})
    test_tokens, word_dict = tokenize_sentences_adv(list_sentences_test, word_dict)

    word_dict[UNKNOWN_WORD] = len(word_dict)


    # # ==== Load additional data ====
    # logger.info('Loading additional data...')
    # swear_words = load_data(swear_words_fname, func=lambda x: set(x.T[0]), header=None)
    # wrong_words_dict = load_data(wrong_words_fname, func=lambda x: {val[0] : val[1] for val in x})


    # ==== Load embedding vectors and clean them ====
    logger.info('Loading embeddings...')
    embedding_list, embedding_word_dict = read_embedding_list(embeds_fname)
    embedding_size = len(embedding_list[0])

    if oov_embeds_file != '':
        logger.info('Loading embeddings for oov words...')
        embedding_list, embedding_word_dict = read_embedding_list(oov_embeds_file, embedding_word_dict, embedding_list)
        embedding_size = len(embedding_list[0])

    logger.info('Cleaning embedding list...')
    embedding_list, embedding_word_dict, oov_words = clear_embedding_list(embedding_list, embedding_word_dict, word_dict)

    # ======== Clean oov words and save them =========
    oov_cleaned = []
    ad = AlphabetDetector()
    with open('data/oov_words_{0}.txt'.format(embeds_type), 'wt+') as oov_file:
        for w in oov_words:
            if ad.only_alphabet_chars(w, "LATIN") and re.match(r'^[A-Za-z]+$', w) and (len(w) <= 15):
                oov_cleaned.append(w)
                oov_file.write(w+'\n')


    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append(np.asarray([0.] * embedding_size))
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append(np.asarray([-1.] * embedding_size))

    embedding_matrix = np.array(embedding_list)

    # ==== Convert word tokens into sequences of word ids  ====
    logger.info('Converting tokens to word ids...')
    id_to_word = dict((id, word) for word, id in word_dict.items())
    train_token_ids = convert_tokens_to_ids(tokenized_sentences=train_tokens,
                                                    words_list=id_to_word,
                                                    embedding_word_dict=embedding_word_dict,
                                                    sentences_length=500)

    test_token_ids = convert_tokens_to_ids(tokenized_sentences=test_tokens,
                                                    words_list=id_to_word,
                                                    embedding_word_dict=embedding_word_dict,
                                                    sentences_length=500)

    # ==== Prepare train/test data for NN ====
    x = np.array(train_token_ids)
    y = np.array(train_df[target_labels].values)
    x_test = np.array(test_token_ids)

    # ==== Saving the results ====
    logger.info("Saving results...")
    np.save(train_clean, x)
    np.save(train_labels, y)
    np.save(test_clean, x_test)
    np.save(embeds_clean, embedding_matrix)
Example #15
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

ad = AlphabetDetector()

with open("D:/python_arcticles/texts1.txt", 'r') as f:
    words = [line.rstrip('\n') for line in f]


def is_there_number(string):
    return any(i.isdigit() for i in string)


def is_not_blank(s):
    return bool(s and s.strip())


nltk.download('stopwords')
mywords = []

for word in words:
    if ad.only_alphabet_chars(u"{}".format(word), "CYRILLIC") and is_there_number(word) == False \
            and is_not_blank(word) and word not in stopwords.words('russian'):
        mywords.append(word)
print(mywords)

fdist = FreqDist(mywords)

print(fdist.most_common(5))
Example #16
class Preprocessing(object):
    def __init__(self):
        # Pre-loading objects
        self.re_signatures = [
            re.compile(each) for each in stopword_lists.signatures
        ]
        self.mystem = Mystem()
        # self.morph = pymorphy2.MorphAnalyzer()

        self.mystem_lemma_dict = None
        self.ad = AlphabetDetector()

        # self.proj_path = '/'.join(inspect.getfile(inspect.currentframe()).split('/')[:-2])
        print('Preprocessing  loaded')

        # Dicts
        # self.en_dict = enchant.DictWithPWL("en_US", self.proj_path + '/Preprocessing/Dicts/IT_EN_dict.txt')
        # self.ru_aot_dict = enchant.Dict("ru_RU")
        self.stop_words = set(stopword_lists.yandex_seo_stopwords +
                              stopword_lists.custom_stop_words +
                              stopwords.words('russian'))
        self.padding_punctuation = """!"#$%&\'()*+,;<=>?[\\]^`{|}~/��"""
        self.full_punctuation = string.punctuation + '��'

    # ======================================== #
    # ######## STRING PREPROCESSING ########## #
    # ======================================== #
    @staticmethod
    def normalize(input_string):
        return input_string.lower().strip().replace('\n', ' ').replace(
            '\r', ' ').replace('\t', ' ')

    @staticmethod
    def cut_by_signature(input_string, signature_string):
        """ Cut by input search pattern (string) """
        p = re.compile(signature_string)
        search = p.search(input_string)
        try:
            start_index = search.span()[0]  # start index
            if start_index > 4:  # Do not cut from the beginning
                return input_string[:start_index]
            else:
                return input_string
        except AttributeError:
            return input_string

    def cut_by_signatures(self, input_string):
        """
        Find index of earliest signature
        with precompiled regex and cut it
        """
        beginnings = []
        for each in self.re_signatures:
            try:
                # Index of 1st found position
                beginnings.append(each.search(input_string).span()[0])
            except AttributeError:
                pass

        if beginnings:
            cut = min(beginnings)
            # Not in the beginning
            if cut > 5:
                return input_string[:cut]
            else:
                return input_string
        else:
            return input_string

    def pad_punctuation(self, input_string, punct_list=None):
        """ Used to control tokenization """
        normal_text = input_string.strip()
        padding_punctuation = punct_list if punct_list else self.padding_punctuation
        for char in padding_punctuation:
            normal_text = normal_text.replace(char, ' ' + char + ' ')
        return normal_text

    @staticmethod
    def tokenize(input_string):
        return nltk.word_tokenize(input_string)

    def get_vocab(self, series):
        return set(self.series_to_chain(series))

    def get_all_token_chain(self, series):
        return self.series_to_chain(series)

    def is_punct(self, token):
        """ True only if all chars are punct """
        for c in token:
            if c not in self.full_punctuation:
                return False
        return True

    def remove_punct(self, tokenlist):
        return [token for token in tokenlist if not self.is_punct(token)]

    @staticmethod
    def contains_digits(input_string):
        return any(char.isdigit() for char in input_string)

    def contains_punct(self, input_string):
        return any(self.is_punct(char) for char in input_string)

    def is_cyrillic(self, token):
        """
        Checks if string has only cyrillic letters
        """
        # return not(any(ord(c) < 128 for c in token))
        if self.contains_digits(token) or self.contains_punct(token):
            return False
        else:
            return self.ad.only_alphabet_chars(token, 'CYRILLIC')

    def remove_stopwords(self, tokenized_text, stopword_list=None):
        if not stopword_list:
            stopword_list = self.stop_words
        return [t for t in tokenized_text if t not in stopword_list]

    @staticmethod
    def remove_by_token_length(tokenized_text, min_len=1, max_len=25):
        return [
            t for t in tokenized_text if len(t) >= min_len and len(t) < max_len
        ]

    # ======================================== #
    # ########### POS/LEMMATIZING ############ #
    # ======================================== #
    '''
    def get_pymorphy_lemma(self, token):
        return self.morph.parse(token)[0].normal_form
    '''

    def get_mystem_lemma(self, token):
        # Returns [POS-tag, lemma] for token
        response = self.mystem.analyze(token)
        analysis = response[0].get('analysis')
        try:
            the_one = analysis[0]
            lex = the_one.get('lex')
            return lex
        except:
            return token

    def get_mystem_pos_tags(self, token):
        response = self.mystem.analyze(token)
        analysis = response[0].get('analysis')
        try:
            the_one = analysis[0]
            tag = the_one.get('gr')
            return tag
        except:
            return None

    def lemmatize_series(self, series):
        if not self.mystem_lemma_dict:
            print('Building lemma-dictionary')
            vocab = self.get_vocab(series)
            self.mystem_lemma_dict = {
                token: self.get_mystem_lemma(token)
                for token in vocab
            }
        return series.apply(
            lambda tokenlist:
            [self.mystem_lemma_dict[token] for token in tokenlist])

    def get_nltk_pos_df(self, series):
        all_tokens = self.get_all_token_chain(series)
        nltk_tags_tuple = nltk.pos_tag(all_tokens, lang='rus')
        tags = set([each[1] for each in nltk_tags_tuple])

        def get_tokens_by_tag(tag):
            # Set of tokens by input tag
            token_tag_list = list(
                filter(lambda x: x[1] == tag, nltk_tags_tuple))  # [token, tag]
            return [each[0] for each in token_tag_list]  # [token]

        tag_dict = collections.OrderedDict(
            zip(tags, [get_tokens_by_tag(tag) for tag in tags]))
        return pd.DataFrame.from_dict(tag_dict, orient='index').transpose()

    def get_mystem_pos_df(self, series):
        all_tokens = self.get_all_token_chain(series)
        mystem_tags_dict = {
            token: self.get_mystem_pos_tags(token)
            for token in set(all_tokens)
        }
        # filter_dict(mystem_tags_dict)
        mystem_tags_dict = dict(
            filter(lambda item: item[1] is not None, mystem_tags_dict.items()))

        def get_tokens_by_mystem_tag(input_tag):
            matched_tokens = [(token, all_tokens.count(token))
                              for token, tags in mystem_tags_dict.items()
                              if input_tag in tags]
            return sorted(matched_tokens, key=lambda x: x[1], reverse=True)

        # {tag: (token, count), ...}
        mystem_tag_dict = collections.OrderedDict(
            zip(stopword_lists.forbidden_mystem_tags, [
                get_tokens_by_mystem_tag(tag)
                for tag in stopword_lists.forbidden_mystem_tags
            ]))
        return pd.DataFrame.from_dict(mystem_tag_dict,
                                      orient='index').transpose()

    # ======================================== #
    # ########## Jupyter analysis ############ #
    # ======================================== #
    @staticmethod
    def stats_for_untokenized(series):
        """ Counts symbols in series of texts """
        return sum([len(each) for each in series])

    @staticmethod
    def series_to_chain(series):
        """Chained tokens in Series"""
        return list(itertools.chain.from_iterable(list(series.values)))

    def stats_for_series(self, series):
        """DF from Series stats"""
        empty_texts_indexes = list(series[series.astype(str) == '[]'].index)
        empty_texts = len(empty_texts_indexes)
        token_chain = self.series_to_chain(series)

        data = pd.DataFrame(data=[[
            len(token_chain),
            len(list(set(token_chain))),
            len(series), empty_texts,
            token_chain.count('')
        ]],
                            index=['Count'],
                            columns=[
                                'Total tokens', 'Unique tokens', 'Total texts',
                                'Empty texts', 'Empty tokens'
                            ])
        return data

    @staticmethod
    def check_empty_texts(series, original_df=None):
        """Get unprocessed text for '[]' in Series"""
        empty_texts_indexes = list(series[series.astype(str) == '[]'].index)
        if original_df is not None:
            return original_df.loc[empty_texts_indexes]
        else:
            return empty_texts_indexes

    @staticmethod
    def drop_empty_text_rows(series):
        drop_indexes = series[series.astype(str) == '[]'].index
        return series.drop(drop_indexes)

    @staticmethod
    def plot_occurrences(series, str_expression):
        """
        Detects first occurrence of str expression in text.
        Plots index distribution of occurrences.
        """
        indexes = [
            text.index(str_expression) for text in series
            if str_expression in text
        ]
        fig, ax = plt.subplots()
        ax.hist(indexes, range(0, 50))
        ax.set_xticks(np.arange(0, 51, 1))
        ax.set_xlabel('Position')
        ax.set_ylabel('Count')
        plt.title("Occurrence distribution")
        print(len(indexes), ' occurrences found')
        return ax

    def get_token_frequencies_df(self, series, topn=50):
        ctr = collections.Counter(self.series_to_chain(series))
        fdist_list = ctr.most_common(topn)
        tokens = [k for k, v in fdist_list]
        counts = [v for k, v in fdist_list]
        return pd.DataFrame({"token": tokens, "count": counts})

    def plot_token_frequencies(self, series, top_n=30):
        """ Plot frequency distribution over corpus for top_n tokens tokens """
        ctr = collections.Counter(list(self.series_to_chain(series)))
        fdist_list = ctr.most_common(top_n)
        tokens = [k for k, v in fdist_list]
        counts = [v for k, v in fdist_list]
        token_count = pd.DataFrame({"token": tokens, "count": counts})
        sns.barplot(x="count", y="token",
                    data=token_count).set_xlabel('Token appearance')

    def plot_token_distribution(self, series):
        """ Overall tokens lenghts distribution for series """
        token_lenghts = [len(x) for x in self.series_to_chain(series)]
        bow_lenghts = [len(x) for x in series]

        # Unique lens
        fig, ax = plt.subplots(ncols=2)

        ax[0].hist(token_lenghts, bins=range(0, 25))
        ax[0].set_xticks(np.arange(0, 26, 1))
        ax[0].set_xlabel('Token length')
        ax[0].set_ylabel('Count')

        ax[1].hist(bow_lenghts, bins=range(0, 25))
        ax[1].set_xticks(np.arange(0, 26, 1))
        ax[1].set_xlabel('Tokens in docs')
        ax[1].set_ylabel('Count')

        return ax

    @staticmethod
    def most_common_in_df(df):
        result = dict()
        for col in df.columns:
            try:
                col_most_freq = df[col].value_counts().reset_index()
                tokens = col_most_freq['index']
                freqs = col_most_freq[col]
                result[col] = [(t, f) for t, f in zip(tokens, freqs)]
            except:
                result[col] = [None]
        return pd.DataFrame.from_dict(result, orient='index').transpose()

    # ======================================== #
    # ###### TOKEN SEQUENCE PROCESSING ####### #
    # ======================================== #
    @staticmethod
    def get_texts_with_token(series, token):
        return [text for text in series if token in text]

    @staticmethod
    def cut_after_token(tokenlist, token, pos=0):
        """ Truncate token list after input token position """
        if token in tokenlist:
            if tokenlist.index(token) > 1:
                return tokenlist[:tokenlist.index(token) + pos]
            else:
                return tokenlist
        else:
            return tokenlist

    @staticmethod
    def get_indexes_of_token(series, token):
        """ Indexes of the token in all documents """
        indexes = [text.index(token) for text in series if token in text]
        return indexes

    @staticmethod
    def token_scope(series, token, pos):
        """ Set of tokens going before or after (by position) the given token """
        found = series.apply(lambda x: x[x.index(token) + pos]
                             if token in x else 0)
        token_set = list(set(found[found != 0]))
        return token_set

    @staticmethod
    def seq_in_series(series, seq):
        """ Return text if sequence is in token list """
        result = []
        for text in series:
            if seq[0] in text:
                index = text.index(seq[0])
                if seq == text[index:(index + len(seq))]:
                    result.append(text)
        return result

    def plot_indexes_of_token(self, series, token, x_range):
        indexes = self.get_indexes_of_token(series, token)
        fig, ax = plt.subplots()
        ax.hist(indexes, bins=range(0, x_range))
        ax.set_xticks(np.arange(0, x_range + 1, 1))
        ax.set_yticks(np.arange(0, 21, 1))
        ax.set_xlabel('Index')
        ax.set_ylabel('Count')
        plt.title(token)
        return ax

    @staticmethod
    def cut_after_seq(tokenlist, seq):
        """ Truncate document after token sequence """
        if seq[0] in tokenlist:  # if first element of seq is in text
            index = tokenlist.index(seq[0])
            if seq == tokenlist[index:(index + len(seq))]:  # if the whole sequence is present
                return tokenlist[:tokenlist.index(seq[0])]
            else:
                return tokenlist
        else:
            return tokenlist

    @staticmethod
    def cut_seq(tokenlist, seq):
        """ Removes sequence from tokenized texts. """
        if seq[0] in tokenlist:
            index = tokenlist.index(seq[0])
            if seq == tokenlist[index:(index + len(seq))]:
                '''
                for each in seq:
                    del tokenlist[tokenlist.index(each)]
                return tokenlist
                '''
                return tokenlist[:index] + tokenlist[
                    index + len(seq):]  # TODO: test it
            else:
                return tokenlist
        else:
            return tokenlist

    # ======================================== #
    # ################ OTHER ################# #
    # ======================================== #
    def separate_by_category(self, series):
        """
        Separates tokens by the types of characters they contain (punctuation, numbers, ...)
        :param series: series of tokenized texts
        :return: dict of {category:[tokenlist]}
        """
        vocab = self.series_to_chain(series)

        result = {
            'num_punct': [],
            'alpha_num': [],
            'alpha_punct': [],
            'punct_tokens': [],
            'numeric_tokens': [],
            'alpha_tokens': [],
            'alpha_num_punct': []
        }

        for token in vocab:
            # Add flag by symbol category
            punct = [
                1 for symbol in token if (symbol in self.full_punctuation)
            ]
            numerics = [1 for symbol in token if (symbol.isnumeric())]
            alpha = [1 for symbol in token if (symbol.isalpha())]

            # If token contains all types
            if (punct and numerics) and alpha:
                result['alpha_num_punct'].append(token)

            # Double
            elif numerics and punct:
                result['num_punct'].append(token)

            elif numerics and alpha:
                result['alpha_num'].append(token)

            elif alpha and punct:
                result['alpha_punct'].append(token)

            # Simple
            elif punct:
                result['punct_tokens'].append(token)

            elif numerics:
                result['numeric_tokens'].append(token)

            elif alpha:
                result['alpha_tokens'].append(token)

        return result

    def get_categories_df(self, series):
        """
        Separates tokens by the types of characters they contain (punctuation, numbers, ...)
        into different categories and sorts them by frequency
        """
        separated_categories_dict = self.separate_by_category(series)
        categories = pd.DataFrame.from_dict(separated_categories_dict,
                                            orient='index')
        return categories.transpose()

    # ======================================== #
    # ############## PIPELINES ############### #
    # ======================================== #
    def apply_pipeline(self, raw_string):
        """ Apply all the methods to raw string """
        normalized = self.normalize(raw_string)
        # print('normalized: ', normalized)
        signatures_cut = self.cut_by_signatures(normalized)
        # print('signatures_cut: ', signatures_cut)
        padded = self.pad_punctuation(signatures_cut)
        # print('padded: ', padded)
        tokenized = self.tokenize(padded)
        # print('tokenized: ', tokenized)
        no_punct = self.remove_punct(tokenized)
        # print('no_punct: ', no_punct)
        no_stops = self.remove_stopwords(no_punct)
        cut_by_len = [t for t in no_stops if len(t) < 25]
        lemmatized = [self.get_mystem_lemma(token) for token in cut_by_len]
        # print('lemmatized: ', lemmatized)
        return lemmatized

    def apply_short_pipeline(self, raw_string):
        """ Preprocessing for manual input in window form on client-side """
        normalized = self.normalize(raw_string)
        tokenized = self.tokenize(normalized)
        cut_by_len = [t for t in tokenized if len(t) < 25]
        lemmatized = [self.get_mystem_lemma(token) for token in cut_by_len]
        return lemmatized

    @staticmethod
    def pickle_save(data, path):
        with open(path, 'wb') as fp:
            print(type(data))
            pickle.dump(data, fp)
            print('Saved as ', path)
Example #17
def process(list, number, dir):
    ad = AlphabetDetector()
    nullreturn = (0, [])
    post = list[0]
    comments = list[1:]
    count = 0
    data = []
    if not ad.only_alphabet_chars(post["title"], "LATIN"):
        return nullreturn

    if len(comments) < 2:
        return nullreturn

    commentids = []
    for comment in comments:
        commentids.append(comment["id"])

    level1 = []
    level1ids = []
    level2 = []
    notlevel1 = []
    notlevel2 = []

    for comment in comments:
        if comment["parent_id"][3:] not in commentids:
            level1.append(comment)
            level1ids.append(comment["id"])
        else:
            notlevel1.append(comment)

    for comment in notlevel1:
        if comment["parent_id"][3:] not in level1ids:
            notlevel2.append(comment)
        else:
            level2.append(comment)
    if len(level2) < 1:
        return nullreturn

    for comment in level2:
        for parent in level1:
            if comment["parent_id"][3:] == parent["id"]:
                break
        if comment["parent_id"][3:] == parent[
                "id"] and comment["body"] != "[deleted]":
            print("Creating file: reddit" + "{:0>4d}".format(number + count) +
                  ".txt")
            with open(
                    os.path.join(
                        dir,
                        "reddit" + "{:0>4d}".format(number + count) + ".txt"),
                    "w") as file:
                file.write(
                    post["title"].replace('\n', ' ').replace('\r', ' ') + "\n")
                file.write(post["url"] + "\n")
                file.write(parent["author"] + ": " + fixLine(parent["body"]))
                file.write(comment["author"] + ": " + fixLine(comment["body"]))
            numcomments = findNext(
                os.path.join(
                    dir, "reddit" + "{:0>4d}".format(number + count) + ".txt"),
                parent["author"], fixLine(
                    parent["body"]), comment, notlevel2) + 2
            data.append([
                post["title"].replace(",",
                                      ""), parent["author"], comment["author"],
                "https://www.reddit.com" + post["permalink"], numcomments
            ])
            count += 1
    return (count, data)
Example #18
def process_text(text):
    '''Clean a text in order to be used in a language model.

        Args:
            text: A string containing the text.
        Returns:
            out_clean: A string containing the clean text.

        '''
    out = ""
    # If a line starts with these, remove it.
    words_to_stop = [
        '---------- Forwarded message ---------',
        '---------- Προωθημένο μήνυμα ----------'
    ]
    # Checks if a word is Greek
    ad = AlphabetDetector()
    # Regex that matches lines that contain the date of the message.
    date = re.compile('.*-.*-.*:.*')
    lines = text.split('\n')
    # Remove useless lines.
    for i in range(len(lines)):
        # If line is in the form yyyy-mm-dd hh:mm, remove it.
        if date.match(lines[i]) is not None:
            continue
        elif any(w in lines[i] for w in words_to_stop):
            break
        # Lines with '--' are the signature and lines with '>'
        # represent previous conversations.
        elif lines[i].startswith('--') or lines[i].startswith('>'):
            break
        elif lines[i].startswith('Στις') and lines[i].strip().endswith(
                'έγραψε:'):
            break
        elif lines[i].startswith('On') and lines[i].strip().endswith('wrote:'):
            break
        elif i < len(lines) - 1 and lines[i].startswith('Στις') and lines[
                i + 1].strip().endswith('έγραψε:'):
            break
        # Remove non-greek words.
        else:
            for word in lines[i].split(' '):
                if ad.only_alphabet_chars(word, "GREEK"):
                    out += word + ' '
                # Keep dot after non-Greek word.
                elif word.strip().endswith('.'):
                    out += '. '
        out += '\n'
    # Break line in sentences.
    out = out.replace('\r', '')
    # Set salutation as a separate sentence.
    lines = out.split('\n')
    if lines[0].strip('\n').strip().endswith(',') and (len(
            lines[1].strip('\n').strip()) == 0 or lines[1].isupper()):
        lines[0] = lines[0].strip('\n').strip()[:-1] + '.'
    out = '\n'.join(lines)

    sentences = sent_tokenize(out)
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    sentences = [sent.translate(table) for sent in sentences]
    out_clean = []
    for sent in sentences:
        # split into tokens by white space
        tokens = sent.split()
        # remove remaining tokens that are not alphabetic or numeric.
        toks = []
        for token in tokens:
            if token.isdigit():
                # Convert numeric tokens in Greek text.
                toks.append(converter(token))
            elif token.isalpha():
                toks.append(token)
        # make lower case
        toks = [word.lower() for word in toks]
        if toks:
            out_clean.append(' '.join(toks))
    return out_clean
Example #19
def assignLocation(df, torun, lat_label, lon_label, p=True):
    if p: print("\t\t\tIn Dictionaries: assignLocation...")
    ad = AlphabetDetector()
    timedOut = []
    done = True
    for row in torun:
        # Only fill in missing cities.
        if pd.isnull(df.loc[row, "Province"]):
            # Don't run with missing lat lons.
            if not any([
                    pd.isnull(df.loc[row, lat_label]),
                    pd.isnull(df.loc[row, lon_label])
            ]):
                try:
                    geolocator = Nominatim(timeout=10,
                                           user_agent="Single_Batch_Run")

                    if p:
                        print("\t\t\t\tCities {}% complete...".format(
                            int(row / len(df) * 100)))
                    location = geolocator.reverse("{}, {}".format(
                        df.loc[row, lat_label], df.loc[row, lon_label]))

                    if "address" in location.raw:
                        dictionary = location.raw['address']
                        # Can include Region, Province and Country if desired.
                        if ~pd.isnull(df.loc[row, "Region"]):
                            if "hamlet" in dictionary:
                                if ad.only_alphabet_chars(
                                        dictionary["hamlet"], "LATIN"):
                                    df.loc[row,
                                           "Region"] = dictionary["hamlet"]

                            elif "state_district" in dictionary:
                                if ad.only_alphabet_chars(
                                        dictionary["state_district"], "LATIN"):
                                    df.loc[row, "Region"] = dictionary[
                                        "state_district"]

                            elif "county" in dictionary:
                                if ad.only_alphabet_chars(
                                        dictionary["county"], "LATIN"):
                                    df.loc[row,
                                           "Region"] = dictionary["county"]

                        else:
                            df.loc[row, "Region"] = np.NaN

                        if "Province" in dictionary:
                            if ad.only_alphabet_chars(dictionary["state"],
                                                      "LATIN"):
                                df.loc[row, "Province"] = dictionary["state"]

                        if ad.only_alphabet_chars(dictionary["country"],
                                                  "LATIN"):
                            df.loc[row, "Country"] = dictionary["country"]

                        if ~pd.isnull(df.loc[row, "City"]):
                            if "city" in dictionary:
                                if ad.only_alphabet_chars(
                                        dictionary["city"], "LATIN"):
                                    df.loc[row, "City"] = dictionary["city"]

                            elif "town" in dictionary:
                                if ad.only_alphabet_chars(
                                        dictionary["town"], "LATIN"):
                                    df.loc[row, "City"] = dictionary["town"]

                            elif "village" in dictionary:
                                if ad.only_alphabet_chars(
                                        dictionary["village"], "LATIN"):
                                    df.loc[row, "City"] = dictionary["village"]

                        else:
                            df.loc[row, "City"] = np.NaN

                        sleep(0.5)  # in seconds
                except GeocoderTimedOut:
                    if p: print("\t\tTimed Out")
                    done = False
                    timedOut.append(row)
                except GeocoderServiceError:
                    if p: print("GeocoderServiceError! (Probably certificate)")
                    done = False
                    timedOut.append(row)
    return df, done, timedOut