Example #1
    def add_to_index(self, document, doc_id):
        # parser = HTMLParser(text=document['data'])
        text = document['data']

        nlp = Russian()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tokens = [token.lower() for token in tokens]
        tmp_text = ' '.join(tokens)
        # Skip documents that are too long to process.
        if len(tokens) > 10 ** 6:
            return
        self.doc_iter += 1
        nlp.max_length = 10 ** 8
        doc_text = nlp(tmp_text, disable=['ner', 'parser'])
        lemmas = []
        for s in doc_text:
            lemma = s.lemma_
            lemmas.append(lemma)
            # Stop-word filtering is intentionally left disabled:
            # if lemma not in set(stopwords.words('russian')) \
            #         and lemma not in set(stopwords.words('english')) \
            #         and len(lemma) > 1:
            #     lemmas.append(lemma)
        freq = FreqDist(lemmas)
        for k, v in freq.most_common():
            if k not in self.global_index:
                self.global_index[k] = []
            self.global_index[k].append((doc_id, v))
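For context, a minimal sketch of querying the inverted index built by add_to_index (assumes global_index maps each lemma to a list of (doc_id, frequency) postings; the indexer instance and the query lemma are hypothetical):

def postings_for(global_index, lemma):
    # Return the postings list for a lemma, or an empty list if it was never indexed.
    return global_index.get(lemma.lower(), [])

# Hypothetical usage: documents containing "пример", ranked by in-document frequency.
hits = sorted(postings_for(indexer.global_index, "пример"),
              key=lambda posting: posting[1], reverse=True)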
Example #2
    def __init__(self):
        super(ClassifierKNN, self).__init__()
        self.data_sets = []
        self.texts = {}
        self.options = {}
        self.threshold = 0.3

        self.russian_stop_words = stop_words.get_stop_words('russian')
        self.parser = Russian()
        self.stop_list = set(stopwords.words('russian') + list(
            self.russian_stop_words))
        # List of symbols we don't care about
        self.escape_symbols = ' '.join(string.punctuation).split(' ') +\
                              ['-----', '---', '...', '“', '”', '\'ve']
        # the vectorizer and classifier to use
        # note that I changed the tokenizer in CountVectorizer
        # to use a custom function using spaCy's tokenizer
        self.vectorizer = CountVectorizer(
            tokenizer=self.tokenizeText,
            ngram_range=(1, 1)
        )
        self.clf = KNeighborsClassifier(
            n_neighbors=20, weights='uniform',
            algorithm='auto'#, metric='mahalanobis'
        )

        # the pipeline to clean, tokenize, vectorize, and classify
        self.pipe = Pipeline(
            [
                ('cleanText', ClassifierKNN.CleanTextTransformer()),
                ('vectorizer', self.vectorizer),
                ('clf', self.clf)
            ]
        )
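A minimal usage sketch for the pipeline assembled above, assuming the full ClassifierKNN class is available and that train_texts, train_labels and new_texts are hypothetical lists of raw strings and labels:

clf_knn = ClassifierKNN()
clf_knn.pipe.fit(train_texts, train_labels)    # clean -> tokenize -> vectorize -> fit KNN
predictions = clf_knn.pipe.predict(new_texts)  # predicted labels for unseen documents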
Example #3
    def __init__(self):
        super(ClassifierSpacy, self).__init__()
        self.data_sets = []
        self.texts = {}
        self.options = {}
        self.threshold = 0.3

        self.russian_stop_words = stop_words.get_stop_words('russian')
        self.parser = Russian()
        self.stop_list = set(stopwords.words('russian') + list(
            self.russian_stop_words))
        # List of symbols we don't care about
        self.escape_symbols = ' '.join(string.punctuation).split(' ') +\
                              ['-----', '---', '...', '“', '”', '\'ve']
        # the vectorizer and classifier to use
        # note that I changed the tokenizer in CountVectorizer
        # to use a custom function using spaCy's tokenizer
        self.vectorizer = CountVectorizer(
            tokenizer=self.tokenizeText,
            ngram_range=(1, 1)
        )
        self.clf = MultinomialNB()
        # self.clf = LinearSVC()
        # self.clf = SVC(probability=True)

        # the pipeline to clean, tokenize, vectorize, and classify
        self.pipe = Pipeline(
            [
                ('cleanText', ClassifierSpacy.CleanTextTransformer()),
                ('vectorizer', self.vectorizer),
                ('clf', self.clf)
            ]
        )
Example #4
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
            # import stanfordnlp
            # from spacy_stanfordnlp import StanfordNLPLanguage
            # snlp = stanfordnlp.Pipeline(lang="ru", models_dir="/cs/labs/oabend/lovodkin93/TUPA1_project/stanfordnlp_resources")
            # return StanfordNLPLanguage(snlp)

            #import stanza
            #return stanza.Pipeline(lang='ru', processors='tokenize,pos,lemma,depparse,ner', models_dir="//stanza_resources")
        except (ImportError, OSError) as e:  # a missing package raises ImportError
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
Example #5
def load_spacy_model(model):
    if model == "ru":
        try:
            from spacy.lang.ru import Russian
            return Russian()
        except (ImportError, OSError) as e:  # a missing package raises ImportError
            raise OSError(
                "Failed to get spaCy Russian model. Install it using "
                "pip install git+https://github.com/aatimofeev/spacy_russian_tokenizer.git"
            ) from e
    import spacy
    try:
        return spacy.load(model)
    except OSError:
        spacy.cli.download(model)
        # Workaround from https://github.com/explosion/spaCy/issues/3435#issuecomment-474580269
        from spacy.cli import link
        from spacy.util import get_package_path
        link(model, model, force=True, model_path=get_package_path(model))
        try:
            return spacy.load(model)
        except OSError as e:
            raise OSError(
                "Failed to get spaCy model. Download it manually using "
                "`python -m spacy download %s`." % model) from e
Example #6
def count_simple_stats():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
        # print([token.text for token in tokens])
    print("Texts count:", texts_count)
    print("Sentences count:", sent_count)
    print("Words count:", words_count)
    print("Symbols count:", symbols_count)
Example #7
    def __init__(self):
        from spacy.lang.ru import Russian
        from spacy_russian_tokenizer import (RussianTokenizer, MERGE_PATTERNS,
                                             SYNTAGRUS_RARE_CASES)

        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(
            self.nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
                          name='russian_tokenizer')
Example #8
def spacy_tokenize(text):
    from spacy.lang.ru import Russian

    global NLP
    if not NLP:
        NLP = Russian()

    doc = NLP(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
Example #9
def tokenize():
    with open('data/articles.json', 'r', encoding='utf8') as f:
        json_str = f.readlines()[0]
    articles = json.loads(json_str)
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    # Corpus-level counters accumulated over all articles.
    texts_count = 0
    sent_count = 0
    words_count = 0
    symbols_count = 0
    for title in articles:
        text = articles[title][0].strip()
        texts_count += 1
        sents = nltk.sent_tokenize(text, language="russian")
        sent_count += len(sents)
        tokens = nlp(text)
        words_count += len(tokens)
        symbols = [symb for symb in text if symb != ' ' and symb != '\n']
        symbols_count += len(symbols)
Example #10
def main():
    _fn = 'test_ru.json'
    try:
        with open(_fn, 'r', encoding='utf-8') as _fd:
            _buf = json.load(_fd)
    except IOError as _err:
        print(_err)
        return
    # _text = json.dumps(_buf)
    _text = str(_buf)
    print(_text)
    input("Press ENTER for continue...")
    nlp_obj = Russian()
    _doc = nlp_obj(_text)
    for _token in _doc:
        print(_token.text)
    input("Press ENTER for continue...")
Example #11
def get_tokenizer(lang):
    if lang == "zh":
        # nlp = spacy.load("zh_core_web_sm")
        nlp = Chinese()
    elif lang == "en":
        # nlp = spacy.load("en_core_web_sm")
        nlp = English()
    elif lang == "cs":
        nlp = Czech()
    elif lang == "de":
        # nlp = spacy.load("de_core_web_sm")
        nlp = German()
    elif lang == "ru":
        nlp = Russian()
    else:
        raise Exception("Unacceptable language.")
    return nlp
Example #12
def text_decomposition(text, lang='de'):
    if lang == 'de':
        nlp = spacy.load('de_core_news_md')
    elif lang == 'en':
        nlp = spacy.load("en_core_web_md")
    elif lang == 'ru':
        nlp = Russian()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        print("Unsupported language. Choose from ['en', 'de', 'ru']")
        return

    doc = nlp(text)
    sentences = list()
    for sent in doc.sents:
        sentences.append(sent.text)
    return sentences
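A short usage sketch (the sample string is illustrative; for 'ru' only the rule-based sentencizer is available, so sentence boundaries come from punctuation alone):

sentences = text_decomposition("Это первое предложение. Это второе.", lang='ru')
# expected to yield two sentences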
Example #13
    def __init__(
            self,
            regexp_suffixes=BASE_SUFFIXES_REGEXPS,
            regexp_prefixes=BASE_PREFIXES_REGEXPS,
            regexp_infixes=BASE_INFIXES_REGEXPS,
            regexp_base_token_matches=BASE_TOKEN_MATCH,
            merge_patterns=tuple(MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            terminal_patterns=tuple(NO_TERMINAL_PATTERNS),
    ):
        """
        Parameters
        ----------
        regexp_suffixes : list of dict
            Dict in spacy format. See above for explanation of spacy format.
        regexp_prefixes : list of dict
            Dict in spacy format.
        regexp_infixes : list of dict
            Dict in spacy format.
        regexp_base_token_matches : list of dict
            Dict in spacy format.
        merge_patterns : list of dict
            Dict in spacy format.
        terminal_patterns : list of dict
            Dict in spacy format.
        """
        merge_patterns = list(merge_patterns)
        terminal_patterns = list(terminal_patterns)

        self.nlp_pipeline = Russian()
        self.nlp_pipeline.tokenizer = self.create_custom_pretokenizer(
            nlp_model=self.nlp_pipeline,
            prefix_regexp=regexp_prefixes,
            suffix_regexp=regexp_suffixes,
            infix_regexp=regexp_infixes,
            token_match_regexp=regexp_base_token_matches,
        )

        self.tokenizer_postprocesser = RussianTokenizer(
            self.nlp_pipeline,
            merge_patterns=merge_patterns,
            terminal_patterns=terminal_patterns)

        self.nlp_pipeline.add_pipe(self.tokenizer_postprocesser,
                                   name='russian_tokenizer_postprocesser')
Example #14
def spacy_tokenize2(text):
    from spacy.lang.ru import Russian
    from spacy_russian_tokenizer import (
        RussianTokenizer,
        MERGE_PATTERNS,
        SYNTAGRUS_RARE_CASES
    )

    global NLP2
    if not NLP2:
        NLP2 = Russian()
        NLP2.add_pipe(
            RussianTokenizer(NLP2, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES),
            name='russian_tokenizer'
        )

    doc = NLP2(text)
    chunks = [token.text for token in doc]
    return find_substrings(chunks, text)
Example #15
    def make_document_tf(self, document):
        tf = {}
        parser = HTMLParser(text=document['data'])
        text = parser.get_text()

        nlp = Russian()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
        tmp_text = ' '.join(tokens)
        doc_text = nlp(tmp_text)
        lemmas = []
        for s in doc_text:
            if s.lemma_ not in set(stopwords.words('russian')) \
                    and s.lemma_ not in set(stopwords.words('english')):
                lemmas.append(s.lemma_)
        freq = FreqDist(lemmas)
        print(freq.most_common(10))

        # TODO most_common -> all
        for k, v in freq.most_common(10):
            tf = self.update_tf(tf, k, document['url'], v)

        return tf
Example #16
    def spacy_sentence_scores(self) -> Dict[str, float]:
        nlp = Russian()
        sentencizer = nlp.create_pipe('sentencizer')
        nlp.add_pipe(sentencizer)

        raw_text = self.text
        docx = nlp(raw_text)
        stopwords = list(STOP_WORDS)

        word_frequencies = {}
        for word in docx:
            if word.text not in stopwords:
                word = MORPH.parse(word.text)[0].normalized
                if not ('PREP' in word.tag or 'CONJ' in word.tag or 'PRCL' in word.tag or 'INTJ' in word.tag):
                    if word.word not in word_frequencies.keys():
                        word_frequencies[word.word] = 1
                    else:
                        word_frequencies[word.word] += 1

        maximum_frequency = max(word_frequencies.values())

        for word in word_frequencies.keys():
            word_frequencies[word] = (word_frequencies[word] / maximum_frequency)
        sentence_list = [sentence for sentence in docx.sents]

        sentence_scores = {}
        for sent in sentence_list:
            for word in sent:
                word = MORPH.parse(word.text)[0].normalized
                if not ('PREP' in word.tag or 'CONJ' in word.tag or 'PRCL' in word.tag or 'INTJ' in word.tag):
                    if word.word in word_frequencies.keys():
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word.word]
                        else:
                            sentence_scores[sent] += word_frequencies[word.word]

        return sentence_scores
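The scores above are typically used to pick the highest-ranked sentences for an extractive summary; a minimal sketch using heapq (the summary size of 3 is an arbitrary assumption):

from heapq import nlargest

def top_sentences(sentence_scores, n=3):
    # Keep the n highest-scoring sentence spans and join their text.
    best = nlargest(n, sentence_scores, key=sentence_scores.get)
    return ' '.join(sent.text for sent in best)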
Example #17
File: utils.py Project: vmkhlv/histqa
    def __init__(self):
        self.nlp = Russian()
        self.nlp.add_pipe(RussianTokenizer(self.nlp, MERGE_PATTERNS),
                          name="russian_tokenizer")
Example #18
    def __init__(self,
                 lowercase=True,
                 keepcaps=False,
                 normalize=3,
                 ignore_quotes=False,
                 ignore_reddit_quotes=False,
                 ignore_stopwords=False,
                 stem=False,
                 remove_punct=True,
                 remove_breaks=True,
                 decontract=False,
                 twitter_handles=False,
                 urls=False,
                 hashtags=False,
                 numbers=False,
                 subreddits=False,
                 reddit_usernames=False,
                 emails=False,
                 extra_patterns=None,
                 keep_untokenized=None,
                 whitespaces_to_underscores=True,
                 remove_nonunicode=False,
                 pos_emojis=None,
                 neg_emojis=None,
                 neutral_emojis=None,
                 print_url_warnings=False,
                 latin_chars_fix=False,
                 ngrams=1):
        self.params = locals()

        #self._nlp = English()
        self._nlp = Russian()
        russian_tokenizer = RussianTokenizer(
            self._nlp, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self._nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')

        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add('HASHTAG', None, [{
            'ORTH': '#'
        }, {
            'IS_ASCII': True
        }])
        self._merging_matcher.add('SUBREDDIT', None, [{
            'ORTH': '/r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }], [{
            'ORTH': 'r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }])
        self._merging_matcher.add('REDDIT_USERNAME', None,
                                  [{
                                      'ORTH': '/u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }], [{
                                      'ORTH': 'u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }])

        if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
            try:
                self._stopwords = stopwords.words(ignore_stopwords)
            except OSError:
                raise ValueError('Language {} was not found by NLTK'.format(
                    ignore_stopwords))
        elif ignore_stopwords is True:
            self._matcher.add('STOPWORDS', self._remove_token, [{
                'IS_STOP': True
            }])
        elif isinstance(ignore_stopwords, list):
            self._stopwords = [word.lower() for word in ignore_stopwords]
        elif ignore_stopwords is not False:
            raise TypeError(
                'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed'
                .format(type(ignore_stopwords)))

        if lowercase and (not keepcaps):
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False
            }])
        elif lowercase and keepcaps:
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False,
                'IS_UPPER': False
            }])

        if remove_punct:
            self._matcher.add('PUNCTUATION', self._remove_token,
                              [{
                                  'IS_PUNCT': True
                              }])

        if remove_breaks:

            def break_check(text):
                return bool(BREAKS_RE.fullmatch(text))

            break_flag = self._nlp.vocab.add_flag(break_check)
            self._matcher.add('BREAK', self._remove_token, [{
                break_flag: True
            }])

        if normalize:

            def normalize_check(text):
                return bool(NORMALIZE_RE.search(text))

            normalize_flag = self._nlp.vocab.add_flag(normalize_check)
            self._matcher.add('NORMALIZE', self._normalize,
                              [{
                                  normalize_flag: True
                              }])

        if numbers is not False:
            self._matcher.add('NUMBER', self._replace_token, [{
                'LIKE_NUM': True
            }])
            self._replacements['NUMBER'] = numbers

        if urls is not False:
            if urls in [
                    'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title'
            ]:
                self._urls = urls
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            elif isinstance(urls, dict):
                self._domains = urls
                self._urls = 'domain_unwrap_fast'
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            else:
                self._matcher.add('URL', self._replace_token, [{
                    'LIKE_URL': True
                }])
                self._replacements['URL'] = urls

        if emails is not False:
            self._matcher.add('EMAIL', self._replace_token, [{
                'LIKE_EMAIL': True
            }])
            self._replacements['EMAIL'] = emails

        if reddit_usernames is not False:

            def reddit_username_check(text):
                return bool(REDDITORS_RE.fullmatch(text))

            reddit_username_flag = self._nlp.vocab.add_flag(
                reddit_username_check)
            self._matcher.add('REDDIT_USERNAME', self._replace_token,
                              [{
                                  reddit_username_flag: True
                              }])
            self._replacements['REDDIT_USERNAME'] = reddit_usernames

        if subreddits is not False:

            def subreddit_check(text):
                return bool(SUBREDDITS_RE.fullmatch(text))

            subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
            self._matcher.add('SUBREDDIT', self._replace_token,
                              [{
                                  subreddit_flag: True
                              }])
            self._replacements['SUBREDDIT'] = subreddits

        if twitter_handles is not False:
            self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                              [{
                                  twitter_handle_flag: True
                              }])

        if hashtags is not False:
            self._matcher.add('HASHTAG', self._hashtag_postprocess,
                              [{
                                  hashtag_flag: True
                              }])

        if hashtags == 'split' or twitter_handles == 'split':
            file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
            with open(file) as f:
                self._words = f.read().split()
            self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                                  for i, k in enumerate(self._words))
            self._maxword = max(len(x) for x in self._words)

        if twitter_handles == 'realname':
            with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
                self._realnames = json.load(f)

        if ignore_quotes:
            self._merging_matcher.add('QUOTE', None, [{
                'ORTH': '"'
            }, {
                'OP': '*',
                'IS_ASCII': True
            }, {
                'ORTH': '"'
            }])

            def doublequote_check(text):
                return bool(QUOTES_RE.fullmatch(text))

            doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
            self._matcher.add('DOUBLE_QUOTES', self._remove_token,
                              [{
                                  doublequote_flag: True
                              }])

        if self._stopwords:

            def stopword_check(text):
                return bool(text.lower() in self._stopwords)

            stopword_flag = self._nlp.vocab.add_flag(stopword_check)
            self._matcher.add('STOPWORD', self._remove_token,
                              [{
                                  stopword_flag: True
                              }])

        if keep_untokenized is not None:
            if not isinstance(keep_untokenized, list):
                raise ValueError(
                    "keep_untokenized has to be either None or a list")
            for i, phrase in enumerate(keep_untokenized):
                phrase_tokens = phrase.split(' ')
                rule = []
                for token in phrase_tokens:
                    rule.append({'LOWER': token.lower()})
                self._merging_matcher.add('RULE_' + str(i), None, rule)

        if pos_emojis:
            if not isinstance(pos_emojis, list):
                pos_emojis = POS_EMOJIS
            pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
            self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
            self._replacements['HAPPY'] = 'POS_EMOJI'

        if neg_emojis:
            if not isinstance(neg_emojis, list):
                neg_emojis = NEG_EMOJIS
            neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
            self._matcher.add('SAD', self._replace_token, *neg_patterns)
            self._replacements['SAD'] = 'NEG_EMOJI'

        if neutral_emojis:
            if not isinstance(neutral_emojis, list):
                neutral_emojis = NEUTRAL_EMOJIS
            neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
            self._matcher.add('NEUTRAL', self._replace_token,
                              *neutral_patterns)
            self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

        if isinstance(extra_patterns, list):
            self._flags = {}
            for name, re_pattern, replacement_token in extra_patterns:

                def flag(text):
                    return bool(re_pattern.match(text))

                self._flags[name] = self._nlp.vocab.add_flag(flag)
                self._matcher.add(name, self._replace_token,
                                  [{
                                      self._flags[name]: True
                                  }])
                self._replacements[name] = replacement_token

        if stem and ('nltk' in sys.modules):
            if stem == 'stem':
                self._stemmer = PorterStemmer()
            elif stem == 'lemm':
                self._stemmer = WordNetLemmatizer()
            #elif stem == 'rus':
            #    self._stemmer = SnowballStemmer("russian")
            else:
                raise ValueError(
                    'Stemming method {} is not supported'.format(stem))
            self._matcher.add('WORD_TO_STEM', self._stem_word,
                              [{
                                  'IS_ALPHA': True
                              }])

        retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
        self._matcher.add('RETOKENIZE', self._retokenize,
                          [{
                              retokenize_flag: True,
                              'IS_PUNCT': False,
                              'LIKE_URL': False,
                              'LIKE_EMAIL': False,
                              'LIKE_NUM': False,
                              hashtag_flag: False,
                              twitter_handle_flag: False
                          }])

        self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
        self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
        self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)
Example #19
    def __init__(self):
        self.rus_word_tokenizer = Russian()
        pipe = RussianTokenizer(self.rus_word_tokenizer, MERGE_PATTERNS + SYNTAGRUS_RARE_CASES)
        self.rus_word_tokenizer.add_pipe(pipe, name='russian_tokenizer')
Example #20
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.fr import French
from spacy.lang.zh import Chinese
from spacy.lang.ru import Russian
from spacy.lang.ar import Arabic
from spacy.lang.de import German
from spacy.lang.uk import Ukrainian
from spacy.lang.ro import Romanian

lang_id_to_spacy = {
    'en': English(),
    'es': Spanish(),
    'fr': French(),
    'zh-cn': Chinese(),
    'ru': Russian(),
    'ar': Arabic(),
    'de': German(),
    'uk': Ukrainian(),
    'ro': Romanian()
}

#####################
### Globals
#####################

reddit = Reddit(client_id='OFsSWAsbFrzLpg',
                client_secret='tRReu7VAAyxgEXbGqaE19_OUrR4',
                password='******',
                user_agent='testscript by /u/pocaguirre',
                username='******')
Example #21
    def lemmatize(self, token, pos_tag):
        nlp = Russian()
        docs = iter(nlp(token))
        return next(docs).lemma_
Example #22
def main(args):

    if not os.path.exists(args.save_path):
        os.mkdir(args.save_path)

    tokenizers = {
        "en": spacy.load("en_core_web_sm"),
        "zh": spacy.load("zh_core_web_sm"),
        "ru": Russian(),
        "fr": spacy.load("fr_core_news_sm"),
        "es": spacy.load("es_core_news_sm"),
        "ar": WordTokenizer("arabic"),
    }

    src_tokenizer = None
    if args.src_tok is not None:
        src_tok = tokenizers[args.src_tok]
        if args.src_tok == "ar":

            def tokenize_src(text):
                return [tok for tok in src_tok.tokenize(text)]

        else:

            def tokenize_src(text):
                return [tok.text for tok in src_tok.tokenizer(text)]

        src_tokenizer = tokenize_src

    trg_tokenizer = None
    if args.trg_tok is not None:
        trg_tok = tokenizers[args.trg_tok]
        if args.trg_tok == "ar":

            def tokenize_trg(text):
                return [tok for tok in trg_tok.tokenize(text)]

        else:

            def tokenize_trg(text):
                return [tok.text for tok in trg_tok.tokenizer(text)]

        trg_tokenizer = tokenize_trg

    if args.task == "translation":
        indices = prep_trans_files(
            args.src_file,
            args.trg_file,
            args.save_path,
            src_tok=src_tokenizer,
            trg_tok=trg_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )
    elif args.task == "tagging":
        indices = prep_tag_files(
            args.src_file,
            args.save_path,
            src_tok=src_tokenizer,
            max_len=args.max_len,
            min_len=args.min_len,
        )

    train, indices = train_test_split(indices, test_size=0.3, random_state=42)
    valid, test = train_test_split(indices, test_size=0.5, random_state=42)

    split_to_tsv("train", train, args.save_path)
    split_to_tsv("test", test, args.save_path)
    split_to_tsv("valid", valid, args.save_path)

    # delete temporary files
    os.remove(os.path.join(args.save_path, "temp_src.txt"))
    os.remove(os.path.join(args.save_path, "temp_trg.txt"))
Example #23
    tokens = ' '.join(tokens)
    return tokens.strip()


def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip()
                      if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    return tokens


vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1, 1))
parser = Russian()
clf = LogisticRegression()


class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanup_text(text, False) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self


pipe = Pipeline([('cleanText', CleanTextTransformer()),
                 ('vectorizer', vectorizer), ('clf', clf)])

p = pipe.fit(df['text'].to_list(), df['label'].to_list())
Example #24
import os
import sys
import argparse
import codecs
import json

from tqdm import tqdm
from spacy.lang.ru import Russian
from spacy.lang.en import English

from dataset_utils.utils import save_output
from dataset_utils.global_vars import TEXT_FIELDS

TOKENIZERS = {'Russian': Russian().tokenizer, 'English': English().tokenizer}


def main(input_dir: str, output_dir: str, language: str):
    tasks = [task for task in os.listdir(input_dir) if task in TEXT_FIELDS]
    [
        preprocess_task(input_dir, output_dir, t, TOKENIZERS[language])
        for t in tqdm(tasks)
    ]


def preprocess_task(input_dir: str, output_dir, task: str, preproc_fn):
    """ replaces raw texts with preprocessed ones """
    if not os.path.isdir(output_dir + task):
        # create directories for preprocessed tasks
        os.makedirs(output_dir + task)

    samples = [
Example #25
print(stem_vectorizer.get_feature_names())


#### Spacy

https://github.com/kmike/pymorphy2

from spacy.lang.ru import Russian
# Requires pymorphy2

import spacy

nlp = Russian()  # use directly

txt = train.description[900]

doc = nlp(txt)

for token in doc:
    print(token.text)

==> Not seeing any results.
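If the bare Russian() pipeline yields no usable lemmas, a common workaround is to lemmatize the tokens with pymorphy2 directly; a minimal sketch, assuming pymorphy2 is installed and doc comes from the snippet above:

import pymorphy2

morph = pymorphy2.MorphAnalyzer()
lemmas = [morph.parse(token.text)[0].normal_form for token in doc]
print(lemmas)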

Example #26
File: clean.py Project: rt-chat/chat-api
import nltk

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
from spacy.lang.ru import Russian

nltk.download('stopwords')

stopwords = set(stopwords.words('russian'))
stopwords.update([
    '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '#',
    '№'
])

lemmer = Russian()
stemmer = SnowballStemmer(language='russian')
tokenizer = WordPunctTokenizer()


def filter_words(text: str) -> str:
    tokens = []
    for token in tokenizer.tokenize(text):
        if token not in stopwords and not token.isdigit():
            tokens.append(token)
    return " ".join(tokens)


def lemmatization(text: str) -> str:
    tokens = [token.lemma_ for token in lemmer(text)]
    return " ".join(tokens)
Example #27
#!/usr/bin/env python3
from __future__ import unicode_literals
import sys
import spacy.lang.ru
import re
from spacy.lang.ru import Russian
import spacy
corpora_path = 'raw_corpora.txt'

global_word_count = {}
texts_word_count = []

nlp = Russian()
tokenizer = nlp.Defaults.create_tokenizer()
with open(corpora_path, 'r') as f:
    for cnt, line in enumerate(f):
        line = line.strip()
        if line == '/***/':
            texts_word_count.append({})
            continue
        doc = nlp(line)
        for word in doc:
            if re.match(r'[\w]+', word.text, re.I) is None:
                continue
            if not word.text in global_word_count:
                global_word_count[word.text] = 1
            else:
                global_word_count[word.text] += 1

            if not word.text in texts_word_count[-1]:
                texts_word_count[-1][word.text] = 1
Example #28
            lemmas_.append(doc[0].lemma_)
    return lemmas_


def tokenize_set(s, lemmas, lang):
    if lemmas:
        return [lemmatize(word_tokenize(instance[0]), lang) for instance in s]
    if not lemmas:
        return [word_tokenize(instance[0]) for instance in s]


def get_y(s):
    return [instance[1] for instance in s]


ru_nlp = Russian()
de_nlp = spacy.load('de_core_news_sm')


def delete_stop_words(lang, tweet):
    doc = ''
    allowed_words = []
    if lang == "rus":
        doc = ru_nlp(tweet)
        allowed_words = ["не"]
    elif lang == "ger":
        doc = de_nlp(tweet)
        allowed_words = [
            "gut", "gute", "guter", "gutes", "kaum", "kein", "keine", "keinem",
            "keinen", "keiner", "nicht", "nichts", "nie", "niemand",
            "niemandem", "niemanden", "schlecht"
Example #29
def tokenizer(inp):
    nlp = Russian()
    russian_tokenizer = RussianTokenizer(nlp, MERGE_PATTERNS)
    nlp.add_pipe(russian_tokenizer, name='russian_tokenizer')
    return nlp(inp)
Example #30
from spacy.lang.ja import Japanese
from spacy.lang.ca import Catalan
from spacy.lang.eu import Basque

from DataHandler import load_df_twitter_sent, load_df_lorelei
from util import clean_str as test_clean_str
from nltk.corpus import stopwords
from util import identity_fn, lang2id

language_dict = {
    'english': English(),
    'spanish': Spanish(),
    'french': French(),
    'italian': Italian(),
    'german': German(),
    'russian': Russian(),
    'chinese': Chinese(),
    'japanese': Japanese(),
    'catalan': Catalan(),
    'basque': Basque(),
}


class Tokenizer:
    def __init__(self,
                 language,
                 tokenizer_method='spacy',
                 remove_stopwords=True,
                 lowercase=True,
                 strip_accents=None,
                 ngram_range=(1, 1),