import re

from nltk.tokenize import WordPunctTokenizer


def text_cleaner(text):
    negations_dictionary = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' +
                                   '|'.join(negations_dictionary.keys()) +
                                   r')\b')
    tokenizer = WordPunctTokenizer()
    processed_text = text.lower()
    negation_handled = negations_pattern.sub(
        lambda x: negations_dictionary[x.group()], processed_text)
    processed_text = re.sub("[^A-Za-z]", ' ', negation_handled)
    words = [x for x in tokenizer.tokenize(processed_text) if len(x) > 1]
    return words
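A quick usage sketch (not from the original project) on a made-up sentence:

if __name__ == "__main__":
    sample = "I don't think this isn't great!!!"
    print(text_cleaner(sample))
    # Contractions are expanded before punctuation is stripped, so the
    # output keeps the negations: ['do', 'not', 'think', 'this', 'is', 'not', 'great']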
Example #2
File: readers.py Project: jedimonster/nlp
class NewsgroupsReader(object):
    def __init__(self, tokenize):
        self._tokenize = tokenize
        self._tokenizer = WordPunctTokenizer()

    def get_training(self):
        return self._get_docs('datasets/20news-bydate-train')

    def get_test(self):
        return self._get_docs('datasets/20news-bydate-test')

    def _get_docs(self, path):
        doc_objects = []
        i = 0

        for category in listdir(path):
            for f in listdir(path + "/" + category):
                with codecs.open(path + "/" + category + "/" + f, 'r', encoding='latin1') as content_file:
                    text = content_file.read()
                    tokens = self._tokenizer.tokenize(text) if self._tokenize else text
                    doc_objects.append(Document(i, tokens, category))
                    i += 1

        random.shuffle(doc_objects)
        return doc_objects
Example #3
def filter_stop_words(text, stop_words):
    wpt = WordPunctTokenizer()
    tokenized_words = wpt.tokenize(text)
    processed_words = [word for word in tokenized_words if word not in stop_words]
    text = ' '.join([str(word) for word in processed_words])
    
    return text
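A short usage sketch with a hypothetical stop-word set; the function itself needs WordPunctTokenizer imported from nltk.tokenize:

from nltk.tokenize import WordPunctTokenizer

stop_words = {"the", "is", "a"}
print(filter_stop_words("the cake is a lie", stop_words))
# -> 'cake lie'  (matching is case-sensitive, so upper-case variants would survive)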
Example #4
class PunctTokenizer(object):
    def __init__(self,
                 lower=True,
                 prepend_cls=False,
                 prepend_bos=False,
                 append_eos=False,
                 stopwords=None,
                 specials=SPECIAL_TOKENS):
        self.lower = lower
        self.specials = specials
        self.pre_id = []
        self.post_id = []
        self.stopwords = stopwords
        if prepend_cls and prepend_bos:
            raise ValueError("prepend_bos and prepend_cls are"
                             " mutually exclusive")
        if prepend_cls:
            self.pre_id.append(self.specials.CLS.value)
        if prepend_bos:
            self.pre_id.append(self.specials.BOS.value)
        if append_eos:
            self.post_id.append(self.specials.EOS.value)
        self.punct = WordPunctTokenizer()

    def __call__(self, x):
        if self.lower:
            x = x.lower()

        x = (self.pre_id + self.punct.tokenize(x) + self.post_id)
        if self.stopwords:
            x = [w for w in x if w not in self.stopwords]
        return x
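A usage sketch; SPECIAL_TOKENS is project-specific and must already exist before the class definition (it is used as a default argument), so a hypothetical stand-in enum is shown here purely for illustration:

from enum import Enum

class SPECIAL_TOKENS(Enum):  # hypothetical stand-in for the project's special-token enum
    CLS = "[CLS]"
    BOS = "<bos>"
    EOS = "<eos>"

tok = PunctTokenizer(prepend_bos=True, append_eos=True, stopwords={"the"})
print(tok("The quick brown fox."))
# -> ['<bos>', 'quick', 'brown', 'fox', '.', '<eos>']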
Example #5
def clean_tweet(tweet):
    link_removed = re.sub('https?://[A-Za-z0-9./]+', '', tweet)
    number_removed = re.sub('[^a-zA-Z]', ' ', link_removed)
    lower_case_tweet = number_removed.lower()
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    clean_tweet = (' '.join(words)).strip()
    return clean_tweet
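A usage sketch on a made-up tweet (assumes `re` and WordPunctTokenizer are imported):

print(clean_tweet("Check this out!!! https://t.co/abc123 #NLP rocks"))
# -> 'check this out nlp rocks'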
Example #6
def sentence2words(sentence):
    result = []
    word_punct_tokenizer = WordPunctTokenizer()
    words = word_punct_tokenizer.tokenize(sentence)
    stemmer = nltk.stem.SnowballStemmer('english')
    for word in words:
        stemmed_word = stemmer.stem(word)
        result.append(stemmed_word)
    return result
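A usage sketch (assumes `import nltk` and WordPunctTokenizer are available):

print(sentence2words("Cats were running quickly"))
# Snowball stems each token, giving something like ['cat', 'were', 'run', 'quick']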
Example #7
    def _tokenize(self, doc):
        all_tokens = []
        sentences = sent_tokenize(doc)

        tokenizer = WordPunctTokenizer()
        for sentence in sentences:
            words = tokenizer.tokenize(sentence.lower())
            words = [word for word in words if word not in punctuation]
            all_tokens.extend(words)
        return all_tokens
Example #8
def load_task2(articles_path, labels_path, tokenizer='punct'):
    file_names, labels, spans = get_class_labels(labels_path)
    corpus = load_data(articles_path)
    tknz = WordPunctTokenizer()
    samples = []
    for span, file_name in zip(spans, file_names):
        article = corpus[file_name]
        tokenized_span = tknz.tokenize(article[span[0]:span[1]])
        samples.append(tokenized_span)
    return samples, labels, spans, file_names
Example #9
def _sentence_tok(delex_texts: List[str]) -> List[List[List[str]]]:
    #tokenize the texts
    sentence_tok_texts = []
    tknzr = WordPunctTokenizer()
    for text in delex_texts:
        sentences = sent_tokenize(text)
        tok_sentences = []
        for sentence in sentences:
            tok_sentences.append(tknzr.tokenize(sentence))
        sentence_tok_texts.append(tok_sentences)

    return sentence_tok_texts
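A usage sketch; it needs nltk's sent_tokenize (and the punkt data) plus WordPunctTokenizer:

print(_sentence_tok(["First sentence here. And a second one!"]))
# -> [[['First', 'sentence', 'here', '.'], ['And', 'a', 'second', 'one', '!']]]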
Example #10
class CustomTokenizer:
    def __init__(self, unicode_to_ascii=True, punct_one_token_per_char=True):
        self.unicode_to_ascii = unicode_to_ascii
        self.punct_one_token_per_char = punct_one_token_per_char

        # NOTE: \p{P} (Unicode punctuation) needs the third-party `regex` module
        # (e.g. `import regex as re`); the stdlib `re` rejects this escape.
        self._re_punct = re.compile(r"(\p{P})")
        self._tokenizer = WordPunctTokenizer()

    def tokenize(self, text):
        if self.unicode_to_ascii:
            text = unidecode(text)
        if self.punct_one_token_per_char:
            text = self._re_punct.sub(r"\1 ", text)
        return self._tokenizer.tokenize(text)
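A usage sketch; the class needs `unidecode` plus a regex engine that understands \p{P} (the third-party `regex` package is assumed here, since the stdlib `re` rejects that escape):

import regex as re                      # assumption: Unicode-property-aware engine
from unidecode import unidecode
from nltk.tokenize import WordPunctTokenizer

tok = CustomTokenizer()
print(tok.tokenize("naïve, isn't it?!"))
# unidecode folds "naïve" to "naive", and each punctuation character becomes
# its own token, e.g. '?' and '!' come out separately instead of as '?!'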
Example #11
def stemming_words(text):    
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    
    turkishStemmer = TurkishStemmer()
    
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join(stemmed_words)
    return text
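A usage sketch; it assumes TurkishStemmer wraps the `snowballstemmer` package's Turkish stemmer (which provides the stemWord method used above):

import snowballstemmer
from nltk.tokenize import WordPunctTokenizer

def TurkishStemmer():                     # hypothetical stand-in factory
    return snowballstemmer.stemmer("turkish")

print(stemming_words("kitaplar ve kalemler"))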
Example #12
    def stemming_words(self, text):
        wpt = WordPunctTokenizer()
        words = wpt.tokenize(text)
        turkishStemmer = TurkishStemmer()
        stemmed_words = []
        for word in words:
            stemmed_words.append(turkishStemmer.stemWord(word))
        text = ' '.join(stemmed_words)
        return text
Example #13
class TolstojParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.bgrams = {}
        self.sorted_bgrams = []
        self.tokenizer = WordPunctTokenizer()
        self.token_count = 0


    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
        else:
            self.inside_dd = False


    def handle_data(self, data):
        if self.inside_dd:
            tokens = self.tokenizer.tokenize(data.lower())
            for t1, t2 in zip(tokens, tokens[1:]):
                self.token_count += 1

                if (t1[0] in string.punctuation) or (t2[0] in string.punctuation):
                    continue

                key = t1 + ' ' + t2
                self.bgrams[key] = self.bgrams.get(key, 0) + 1

    def dump_bgrams(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.bgrams, output)
        output.close()


    def make_sorted_bgrams(self):
        self.sorted_bgrams = sorted(self.bgrams.items(), key=lambda x: x[1], reverse=True)

    def print_sorted_bgrams(self):
        for key, count in self.sorted_bgrams:
            print(key, count)
Example #14
def get_average_embedding(embedding, review):
    """
    returns a list of word vectors for all words in review
    then average them to return a final vector

    :param embedding: embedding object - will be either Fasttext or Word2Vec
    :param review: review text
    :return:
    """
    log.debug(f'Getting average embedding for: [{review}]')

    wpt = WordPunctTokenizer()
    # word_vectors = [embedding.wv.get_vector(word) for word in wpt.tokenize(review)]
    word_vectors = [embedding.wv.get_vector(word) for word in wpt.tokenize(review) if word in embedding.wv.vocab]
    log.debug(f'word_vector shape [{np.shape(word_vectors)}]')

    # return average all word vectors to come up with final vector for the review
    # since we are using pre-trained embedding, we may not be able to find all the words
    if np.shape(word_vectors)[0] > 1:
        return np.average(word_vectors, axis=0)
    return None
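A usage sketch assuming gensim 3.x (the function relies on `embedding.wv.vocab`, which gensim 4 removed); `log` is the project's logger, so a stand-in is created here:

import logging
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import WordPunctTokenizer

log = logging.getLogger(__name__)          # stand-in for the project's logger

corpus = [["good", "movie"], ["bad", "movie"], ["great", "plot"]]
model = Word2Vec(corpus, size=10, min_count=1, iter=5)   # gensim 3.x keyword names
vec = get_average_embedding(model, "good movie with a great plot")
print(None if vec is None else vec.shape)                # (10,) here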
Example #15
def generate_fasttext_file(x: pd.DataFrame,
                           y: pd.Series,
                           description: str,
                           feature_column: str,
                           timer: Timer = None,
                           feature_size: int = 100,
                           window_context: int = 5,
                           min_word_count: int = 5,
                           sample: float = 0.001,
                           iterations: int = 5,
                           ):
    """
    generate features using fasttext embedding

    https://radimrehurek.com/gensim/models/fasttext.html

    :param x:
    :param y:
    :param description:
    :param feature_column: feature column name passed through to write_to_file
    :param timer: optional Timer used to record per-stage timings
    :param feature_size: Dimensionality of the word vectors
    :param window_context: The maximum distance between the current and predicted word within a sentence
    :param min_word_count: The model ignores all words with total frequency lower than this
    :param sample: The threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5).
    :param iterations: Number of iterations (epochs) over the corpus
    :return:
    """

    log.info("generating fasttext")
    log.debug(f'{x.head()}')
    wpt = WordPunctTokenizer()

    if timer:
        timer.start_timer(TOKENIZE_TIME_MIN)
    documents = [wpt.tokenize(review) for review in x.array]
    if timer:
        timer.end_timer(TOKENIZE_TIME_MIN)

    if timer:
        timer.start_timer(VECTORIZE_TIME_MIN)


    # TODO: add in configuration for pre-trained
    # if x.shape[0] <= 50:
    ft_model = FastText(documents,
                        size=int(feature_size),
                        window=int(window_context),
                        min_count=int(min_word_count),
                        sample=sample,
                        iter=int(iterations)
                        )

    # else:
    #     log.info("Download pre-trained fasttext")
    #     ft_model = FastText.load_fasttext_format('wiki.simple')
    if timer:
        timer.end_timer(VECTORIZE_TIME_MIN)

    model_file = f"{MODEL_DIR}/{description}-{len(x)}-{feature_size}.model"
    log.info(f'Writing model file: {model_file}')
    if timer:
        timer.start_timer(MODEL_SAVE_TIME_MIN)
    ft_model.save(model_file)
    if timer:
        timer.end_timer(MODEL_SAVE_TIME_MIN)

    if timer:
        timer.start_timer(FEATURE_TIME_MIN)
    feature_df = get_feature_df(ft_model, x)
    if timer:
        timer.end_timer(FEATURE_TIME_MIN)

    return write_to_file(feature_df, y, feature_column, description, include_lda=False)
Example #16
class KareninaParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.doc_id = 0
        self.token_count = 0
        self.token_sum_len = 0      
        self.iindex = {}
        self.paragraphs = []
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = RussianStemmer()


    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
            self.doc_id += 1
        else:
            self.inside_dd = False


    def handle_data(self, data):
        if self.inside_dd:
            self.paragraphs.append(data)
            terms = set()
            for token in self.tokenizer.tokenize(data.lower()):
                if token[0] in string.punctuation:
                    continue

                self.token_count += 1
                self.token_sum_len += len(token)                   

                term = self.stemmer.stem(token)                  

                if term not in terms:
                    terms.add(term)
                    self.iindex.setdefault(term, []).append(self.doc_id)


    def dump_iindex(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.iindex, output)
        output.close()


    def dump_paragraphs(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.paragraphs, output)
        output.close()


    def get_stat(self):
        term_sum_len = 0
        for term in self.iindex.keys():
            term_sum_len += len(term)

        term_count = len(self.iindex.keys())
        
        if not (term_count and self.token_count):
            self.stat = {}

        else:
            self.stat = {
                'token_count': self.token_count,
                'token_avg_len': self.token_sum_len/float(self.token_count),
                'term_count': term_count,
                'term_avg_len': term_sum_len/float(term_count)
            }

        return self.stat


    def print_iindex(self):
        for term in sorted(self.iindex.keys()):
            posting_list = self.iindex[term]
            print(term)
            print(len(posting_list))
            print(posting_list)
            print('---------------------')
Example #17
class Decompounder(object):
    """Word decompunder."""
    def __init__(self):
        """Set up map."""
        self.word_tokenizer = WordPunctTokenizer()

        filename = join(split(__file__)[0], 'data', 'compounds.txt')

        self.decompound_map = {}
        with open(filename, encoding='utf-8') as fid:
            for line in fid:
                parts = line.strip().split('|')
                compound = "".join(parts)
                decompounded_parts = [
                    part for part in parts if part != 's' and part != 'e'
                ]
                decompounded = " ".join(decompounded_parts)
                self.decompound_map[compound] = decompounded

    def decompound_text(self, text):
        """Return decompounded text.

        Parameters
        ----------
        text : str
            Text as a (unicode) str.

        Returns
        -------
        decompounded : str
            String with decompounded parts separated by a whitespace.

        Examples
        --------
        >>> decompounder = Decompounder()
        >>> text = 'Det er en investeringsvirksomhed'
        >>> decomp = decompounder.decompound_text(text)
        >>> decomp == 'det er en investering virksomhed'
        True

        """
        tokens = self.word_tokenizer.tokenize(text)
        return " ".join(
            self.decompound_word(token.lower()) for token in tokens)

    def decompound_word(self, word):
        """Return decompounded word.

        Parameters
        ----------
        word : str
            Word as a (unicode) str.

        Returns
        -------
        decompounded : str
            String with decompounded parts separated by a whitespace.

        Examples
        --------
        >>> decompounder = Decompounder()
        >>> decomp = decompounder.decompound_word('investeringsvirksomhed')
        >>> decomp == 'investering virksomhed'
        True

        """
        return self.decompound_map.get(word, word)
Example #18
    def sentence2words(self, sentence):
        word_punct_tokenizer = WordPunctTokenizer()
        return word_punct_tokenizer.tokenize(sentence)
Example #19
ngrams = []
n, m = 0, 0
t = int(time())
l = len(messages)

for message in messages:
    if message == "<|BEGIN|>":
        ngram = []
    elif message == "<|END|>":
        phrases = []
        for phrase in ngram:
            terms = set(te(phrase, strings=1, nested=1))
            words = list(
                set([ma.parse(w)[0].normal_form
                     for w in wpt.tokenize(phrase)]))
            idx = []
            for word in words:
                w = 1 if word in terms else .5
                idx += [(w, word)]
            phrases += [(idx, phrase)]
        ngrams += [phrases]
    else:
        ngram += [message]
    n += 1
    if time() - t > 1:
        print("%s of %s, %s / sec" % (m, l, n))
        m += n
        n = 0
        t = int(time())
Example #20
class FeatureExtractor(BaseEstimator):
    """Feature extractor for Danish texts."""
    def __init__(self):
        """Set up text processors."""
        self.afinn = Afinn(language='da')
        self.word_tokenizer = WordPunctTokenizer()

    def partial_fit(self, X, y=None):
        """Fit model.

        This is a dummy function.

        """
        return self

    def fit(self, X, y=None):
        """Fit model.

        This is a dummy function.

        """
        return self

    @property
    def features_(self):
        """Set up features."""
        features = [
            'n_characters', 'n_words', 'n_unique_words', 'afinn_sum_valence',
            'afinn_sum_arousal', 'afinn_sum_ambiguity'
        ]
        return features

    def transform(self, raw_documents, y=None):
        """Transform documents to features.

        Parameters
        ----------
        raw_documents : iterable over str
            Iterable with corpus to be transformed.
        y : numpy.array
            Target (not used, dummy parameter).

        """
        X = []
        for n, document in enumerate(raw_documents):
            words = self.word_tokenizer.tokenize(document)
            unique_words = set(words)
            scores = self.afinn.scores(document)
            sum_valence = sum(scores)
            sum_arousal = np.sum(np.abs(scores))

            X.append([
                len(document),
                len(words),
                len(unique_words), sum_valence, sum_arousal,
                sum_arousal - abs(sum_valence)
            ])

        X = np.array(X)
        return X

    fit_transform = transform
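A usage sketch; it assumes numpy, the `afinn` package (with its Danish word list), WordPunctTokenizer and sklearn's BaseEstimator are imported as in the class above:

extractor = FeatureExtractor()
X = extractor.transform(["Det er en rigtig god film", "En dårlig oplevelse"])
print(X.shape)   # (2, 6): one row per document, one column per name in features_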
Example #21
def generate_word2vec_file(x: pd.DataFrame,
                           y: pd.Series,
                           description: str,
                           feature_column: str,
                           timer: Timer = None,
                           feature_size: int = 100,
                           window_context: int = 5,
                           min_word_count: int = 5,
                           sample: float = 0.001,
                           iterations: int = 5,
                           ):
    """
    generate features using word2vec
    :param x:
    :param y:
    :param description:
    :param feature_size:
    :param window_context:
    :param min_word_count:
    :param sample:
    :param iterations:
    :return:
    """
    log.info("generating word2vec")
    log.debug(f'{x.head()}')
    wpt = WordPunctTokenizer()

    if timer:
        timer.start_timer(TOKENIZE_TIME_MIN)
    documents = [wpt.tokenize(review) for review in x.array]
    if timer:
        timer.end_timer(TOKENIZE_TIME_MIN)

    if timer:
        timer.start_timer(VECTORIZE_TIME_MIN)

    # TODO: add configuration for pre-trained or train
    # if x.shape[0] <= 50:
    w2v_model = Word2Vec(documents,
                         size=int(feature_size),
                         window=int(window_context),
                         min_count=int(min_word_count),
                         sample=sample,
                         iter=int(iterations)
                         )
    # else:
    #     log.info("Downloading pre-trained word2vec")
    #     w2v_model = api.load("word2vec-google-news-300")
    if timer:
        timer.end_timer(VECTORIZE_TIME_MIN)


    model_file = f"{MODEL_DIR}/{description}-{len(x)}-{feature_size}.model"
    log.info(f'Writing model file: {model_file}')
    if timer:
        timer.start_timer(MODEL_SAVE_TIME_MIN)
    w2v_model.save(model_file)
    if timer:
        timer.end_timer(MODEL_SAVE_TIME_MIN)

    feature_df = get_feature_df(w2v_model, x)
    return write_to_file(feature_df, y, feature_column, description, include_lda=False)
Example #22
    def evaluate(self):

        path = self.dataset.getPath()
        try:
            features = pickle.load(open(f"{path}/preprocessed.p", "rb"))
        except Exception:
            # fall back to re-processing if the cached pickle is missing or unreadable
            features = self.processor.process()
            pickle.dump(features, open(f"{path}/preprocessed.p", "wb"))
        word_punctuation_tokenizer = WordPunctTokenizer()
        word_tokenized_corpus = [
            word_punctuation_tokenizer.tokenize(sent) for sent in features]
        # print(word_tokenized_corpus)
        embedding_size = 64
        window_size = 3
        min_word = 5
        down_sampling = 1e-2
        ft_model = FastText(word_tokenized_corpus,
                            size=embedding_size,
                            window=window_size,
                            min_count=min_word,
                            sample=down_sampling,
                            sg=1,
                            iter=100)
        # pickle.dump(ft_model, open("ft_model.p", "wb"))
        # ft_model = pickle.load(open("ft_model.p", "rb"))
        # print(ft_model.wv['gün'])
        embedding_matrix = np.zeros((len(ft_model.wv.vocab) + 1, 64))
        for i, vec in enumerate(ft_model.wv.vectors):
            embedding_matrix[i] = vec
        vocab_size = len(ft_model.wv.vocab)+1
        # semantically_similar_words = {words: [item[0] for item in ft_model.wv.most_similar(
        #     [words], topn=5)]for words in ['gün', 'katil', 'ekonomi', 'haber', 'başbakan', 'siyaset']}

        # for k, v in semantically_similar_words.items():
        #     print(k+":"+str(v))
        # # print(ft_model.wv.similarity(w1='siyaset', w2='futbol'))
        # from sklearn.decomposition import PCA

        # all_similar_words = sum(
        #     [[k] + v for k, v in semantically_similar_words.items()], [])

        # # print(all_similar_words)
        # # print(type(all_similar_words))
        # # print(len(all_similar_words))

        # word_vectors = ft_model.wv[all_similar_words]

        # pca = PCA(n_components=2)

        # p_comps = pca.fit_transform(word_vectors)
        # word_names = all_similar_words

        # plt.figure(figsize=(18, 10))
        # plt.scatter(p_comps[:, 0], p_comps[:, 1], c='red')

        # for word_names, x, y in zip(word_names, p_comps[:, 0], p_comps[:, 1]):
        #     plt.annotate(word_names, xy=(x+0.06, y+0.03),
        #                  xytext=(0, 0), textcoords='offset points')
        # plt.show()

        labels = self.dataset.getClasses()
        le = preprocessing.LabelEncoder()
        labels = le.fit_transform(labels)
        labels = to_categorical(labels)
        return self.ft_model(features, labels, embedding_matrix,
                             vocab_size, ft_model)
Example #23
def tokenize(text_array, use_pos=False, data_type=None, lang=None):
    """
    Given an array of sentences, returns:
        If use_pos:
            An array of tokenised sentences (where each tokenised sentence is an array of tokens) 
        else:
            An array of tokenised sentences (where each tokenised sentence is an array of tuples of (token, POS tag))
    NOTE: If use_pos is False, the rest of the kwargs are ignored
    """

    if use_pos:
        # Since POS tags take long to generate, use cached version if exists

        cache_path = None

        if data_type == DatasetType.TRAIN:
            cache_path = os.path.join(SAVED_POS_BASE_PATH,
                                      f'train-{lang}-pos.pickle')
        elif data_type == DatasetType.VAL:
            cache_path = os.path.join(SAVED_POS_BASE_PATH,
                                      f'val-{lang}-pos.pickle')
        elif data_type == DatasetType.TEST:
            cache_path = os.path.join(SAVED_POS_BASE_PATH,
                                      f'test-{lang}-pos.pickle')

        if cache_path and os.path.isfile(cache_path):
            with open(cache_path, 'rb') as handle:
                sentences = pickle.load(handle)
            return sentences

    tokeniser = WordPunctTokenizer()

    sentences = []
    with tqdm(total=len(text_array)) as pbar:
        for sentence in text_array:
            tokens = tokeniser.tokenize(sentence)
            lower_cased_tokens = []
            for tok in tokens:
                tok_lower = tok.lower()
                lower_cased_tokens.append(tok_lower)

            if use_pos:
                # Store tokenised sentence i.e. arrays of (token, POS_TAG) tuples
                try:
                    sentences.append(get_pos_tags(lower_cased_tokens, lang))
                except:
                    sentences.append([
                        get_pos_tags([tok], lang)[0]
                        for tok in lower_cased_tokens
                    ])
            else:
                # Store tokenised sentence
                sentences.append(lower_cased_tokens)
            pbar.update(1)

    if use_pos and cache_path:
        # Store POS tags to allow faster loading on next invocation
        with open(cache_path, 'wb') as handle:
            pickle.dump(sentences, handle)

    return sentences
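A usage sketch for the plain path (use_pos=False), which only needs nltk and tqdm; the POS path additionally needs the project's DatasetType, SAVED_POS_BASE_PATH and get_pos_tags:

print(tokenize(["Hello, World!", "Second sentence."]))
# -> [['hello', ',', 'world', '!'], ['second', 'sentence', '.']]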
Example #24
class SentenceReader:
    def __init__(self,
                 thesaurus,
                 need_deeppavlov=True,
                 deeppavlov_model=None,
                 need_syntax=True,
                 syntax_model=None):
        self.need_deeppavlov = need_deeppavlov

        if need_deeppavlov:
            self.deeppavlov_lemma = deeppavlov_model if deeppavlov_model else build_model(
                configs.morpho_tagger.BERT.morpho_ru_syntagrus_bert,
                download=False)

        if need_syntax:
            self.syntax_model = syntax_model if syntax_model else build_model(
                configs.syntax.syntax_ru_syntagrus_bert, download=False)
        else:
            self.syntax_model = None

        self.tokenizer = WordPunctTokenizer()
        self.thesaurus = thesaurus

    def process_file(self, filename, verbose=False):
        tagged_lemmas = []
        initial_sentences = []

        # Stats for output
        broken_sentences = 0
        failed_lemmatize = 0

        with open(filename) as tagged_file:
            current_sentence_tokens = []
            current_sentence_lemmas = []
            need_append = False

            for line in tagged_file.readlines():
                if line.startswith("# sent_id"):
                    need_append = True
                elif line.startswith("# text"):
                    continue
                elif len(line) < 2:
                    sentences_lemma_divided = self.divide_tagged(
                        current_sentence_lemmas)
                    sentence_initial_divided = self.divide_tagged(
                        current_sentence_tokens)

                    tagged_lemmas += sentences_lemma_divided
                    initial_sentences += sentence_initial_divided
                    broken_sentences += (len(sentences_lemma_divided) - 1)

                    need_append = False
                    current_sentence_tokens = []
                    current_sentence_lemmas = []
                else:
                    if need_append:
                        line_splitted = line.split('\t')
                        current_sentence_tokens.append(
                            line_splitted[1].lower())
                        current_sentence_lemmas.append(
                            line_splitted[2].lower())

        parsed_sentences = []

        for init_tokens, lemma_tokens in zip(initial_sentences, tagged_lemmas):
            deeppavlov_lemma = None
            deeppavlov_pos = None

            if self.need_deeppavlov:
                try:
                    deeppavlov_lemma, deeppavlov_pos = self.get_deeppavlov_info(
                        init_tokens)
                except Exception:
                    failed_lemmatize += 1
                    deeppavlov_lemma = None
                    deeppavlov_pos = None

            parsed_sentences.append(
                ParsedSentence(init_tokens, lemma_tokens, self.thesaurus,
                               deeppavlov_lemma, deeppavlov_pos,
                               self.syntax_model))

        if verbose:
            print("Processed {}. Recovered {} sentences, lost {} too long".
                  format(filename, broken_sentences, failed_lemmatize))

        return parsed_sentences

    def process_directory(self, dir_path, verbose=False):
        text_names = listdir(dir_path)
        all_sentences = []

        for filename in text_names:
            full_path = join(dir_path, filename)
            parsed_sentences = self.process_file(full_path, verbose=verbose)
            all_sentences += parsed_sentences

        return all_sentences

    def divide_tagged(self, tagged_sentence):
        single_sentence = " ".join(tagged_sentence)
        sentence_parts = single_sentence.split(".")
        return [
            self.tokenizer.tokenize(part) + ["."] for part in sentence_parts
            if len(part) > 0
        ]

    def get_deeppavlov_info(self, tagged_sentence):
        sentences = [tagged_sentence]
        morpho_tokens = self.deeppavlov_lemma(sentences)[0].split('\n')
        splitted_info = [x.split('\t') for x in morpho_tokens]
        lemmatized_tokens = [
            splitted[2] for splitted in splitted_info if len(splitted) == 10
        ]
        pos = [
            splitted[3] for splitted in splitted_info if len(splitted) == 10
        ]
        return lemmatized_tokens, pos
Example #25
from nltk import PunktSentenceTokenizer, WordPunctTokenizer
from collections import Counter

vocab_size = 1000

sentTokenizer = PunktSentenceTokenizer()
wordTokenizer = WordPunctTokenizer()

filename = 'data/formatted_movie_lines.txt'
string = open(filename, mode='r', encoding='utf8').read()
string = string.replace("'t", "")
string = string.replace("'s", "")

words = wordTokenizer.tokenize(string)
sentences = set(sentTokenizer.tokenize(string))

vocab = Counter(words).most_common(vocab_size)
vocab_dict = dict(vocab)  # map each frequent word to its count
sentences = [wordTokenizer.tokenize(sentence) for sentence in sentences]

new_sentences = []
with open("lines.txt", mode='w', encoding='utf8') as file:
    for sentence in sentences:
        write = True
        for word in sentence:
            if word in vocab_dict:
                write = False
                break
        if write:
            file.writelines(" ".join(sentence) + "\n")
            new_sentences.append(sentence)
Example #26
from gensim import corpora, models, similarities
from nltk import WordPunctTokenizer
import re

NUM_TOPICS = 40

stopwords = open('stopwords.txt').read().split('\n')
word_re = re.compile(r'[a-z0-9\s]+')
tokenizer = WordPunctTokenizer()
tokenize = lambda text: [w.lower()
                         for w in tokenizer.tokenize(text)
                         if re.match(word_re, w.lower()) and w.lower() not in stopwords]

id2word = corpora.Dictionary.load('dictionary.dict')
mm = corpora.MmCorpus('tfidf.mm')
lsi = models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=NUM_TOPICS)
dic = corpora.Dictionary.load('dictionary.dict')

def get_topics(text, num, model=lsi):
    """ get +num+ topics for text +text+ """
    topics = []

    for t in sorted(model[dic.doc2bow(tokenize(text))],
                    key=lambda t: t[1],
                    reverse=True)[:num]:

        topics.append([u[1] for u in lsi.show_topic(t[0])])

    return topics
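A usage sketch; it assumes the pre-built artefacts loaded at import time (stopwords.txt, dictionary.dict, tfidf.mm) exist in the working directory:

for topic_words in get_topics("machine learning for text classification", num=3):
    print(", ".join(topic_words))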
Example #27
def finder(query):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    stop_words = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s",
        "t", "can", "will", "just", "don", "should", "now"
    ]
    dataset = pd.read_csv("abs.csv")
    data = dataset[['Medline_No', 'Abstract']]
    from nltk import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    descriptions = [
        [word for word in tokenizer.tokenize(description.lower())
         if word not in stop_words]
        for description in data["Abstract"]
    ]

    from gensim import corpora
    corpora_dict = corpora.Dictionary(descriptions)

    #print(corpora_dict.token2id)

    # Basic model which needs improvement
    corpus = [corpora_dict.doc2bow(text) for text in descriptions]
    from gensim import similarities
    index_bow = similarities.SparseMatrixSimilarity(
        corpus, num_features=len(corpora_dict))
    from gensim.models import TfidfModel
    model_tfidf = TfidfModel(corpus)
    vector = model_tfidf[corpus[0]]
    corpus_tfidf = model_tfidf[corpus]
    index_tfidf = similarities.SparseMatrixSimilarity(
        corpus_tfidf, num_features=len(corpora_dict))

    def search(index, query, top_n=10, prints=False):
        """
        This function searches the most similar texts to the query.
            :param index: gensim.similarities object
            :param query: a string
            :param top_n: how many variants it returns
            :param prints: if True returns the results, otherwise prints the results
            :returns: a list of tuples (matched_document_index, similarity_value)
        """
        # getting a BoW vector
        bow_vec = corpora_dict.doc2bow(query.lower().split())
        similarities = index[
            bow_vec]  # get similarities between the query and all index documents
        similarities = [(x, i) for i, x in enumerate(similarities)]
        similarities.sort(key=lambda elem: -elem[0]
                          )  # sorting by similarity_value in decreasing order
        res = []
        if prints:
            print(f"{query}\n")
        for result in similarities[:top_n]:
            if prints:
                #print(f"{data['main_speaker'][result[1]]} \n{data['description'][result[1]]} \t {result[0]} \t \n")
                if result[0] > 0:
                    dic = {
                        'Medline_No': data['Medline_No'][result[1]],
                        'Title': data_dict_title[data['Medline_No'][result[1]]]
                    }
                    myresult.append(dic)
            else:
                res.append((result[1], result[0]))
        if not prints:
            return res

    myresult = []
    search(index_tfidf, query, prints=True)
    return myresult
Example #28
class DataPreparator:
    def __init__(self, input_size, batch_size, path_to_write):
        self.word_vectors = FastText.load("D:\\Typing\\araneum_none_fasttextskipgram_300_5_2018.model")
        self.input_size = input_size
        self.tokenizer = WordPunctTokenizer()
        self.batch_size = batch_size
        self.path = path_to_write
        self.punctuations = ['.', ',', '-', '\'', '\"', '!', '?', '(', ')', ':', ';']

    def define_word_vector(self):
        num = 0
        dir = "D:\\Typing\\texts\\"
        prefix = "{http://www.gribuser.ru/xml/fictionbook/2.0}"
        files = os.listdir(dir)
        inputs = []
        outputs = []
        for file in files:
            tree = ET.parse(dir + file)
            root = tree.getroot()
            for child in root.iter(prefix + 'p'):
                text = child.text
                if text is None:
                    continue
                for line in text.split("."):
                    for char in line:
                        if char in self.punctuations:
                            line = line.replace(char, '')
                    words = self.tokenizer.tokenize(line)
                    for i in range(len(words)-5):
                        try:
                            input = (self.word_vectors[words[i]], self.word_vectors[words[i+1]], self.word_vectors[words[i+2]])
                            output = (self.word_vectors[words[i+3]])
                        except KeyError:
                            continue
                        inputs.append(input)
                        outputs.append(output)
                        if len(outputs) == self.batch_size:
                            with open(self.path + str(num), 'w') as f:
                                for k in range(len(outputs)):
                                    f.write(self.vectors_to_string(inputs[k])+':'+self.vectors_to_string(outputs[k])+'\n')
                                print(num)
                                num += 1
                                inputs = []
                                outputs = []

    def define_freq_word(self, n=1000):
        num = 0
        self.freq_words = self.load_freq_words(n)
        dir = "D:\\Typing\\texts\\"
        prefix = "{http://www.gribuser.ru/xml/fictionbook/2.0}"
        files = os.listdir(dir)
        inputs = []
        outputs = []
        for file in files:
            tree = ET.parse(dir + file)
            root = tree.getroot()
            for child in root.iter(prefix + 'p'):
                text = child.text
                if text is None:
                    continue
                for line in text.split("."):
                    for char in line:
                        if char in self.punctuations:
                            line = line.replace(char, '')
                    words = self.tokenizer.tokenize(line)
                    for i in range(len(words) - 5):
                        if words[i+3] in self.freq_words.keys():
                            try:
                                input = (self.word_vectors[words[i]], self.word_vectors[words[i + 1]],
                                         self.word_vectors[words[i + 2]])
                            except KeyError:
                                continue
                            output = np.zeros(n)
                            output[self.freq_words[words[i+3]]] = 1
                            inputs.append(input)
                            outputs.append(output)
                            if len(outputs) == self.batch_size:
                                with open(self.path + str(num), 'w') as f:
                                    for k in range(len(outputs)):
                                        f.write(self.vectors_to_string(inputs[k]) + ':' + self.vectors_to_string(
                                            outputs[k]) + '\n')
                                    print(num)
                                    num += 1
                                    inputs = []
                                    outputs = []
                                    if num == 85000:
                                        return

    def load_freq_words(self, n):
        words = {}
        counter = 0
        with io.open('D:\\Typing\\freq_words.txt', 'r', encoding='utf-8') as f:
            w = f.read().split('\n')
            for word in w:
                if counter < n:
                    words[word] = counter
                    counter += 1
                else:
                    return words
        return words

    def count_freq_words(self):
        dir = "D:\\Typing\\texts_1\\"
        prefix = "{http://www.gribuser.ru/xml/fictionbook/2.0}"
        files = os.listdir(dir)
        counter = Counter()
        n = 0
        for file in files:
            print(n)
            n += 1
            tree = ET.parse(dir + file)
            root = tree.getroot()
            for child in root.iter(prefix + 'p'):
                text = child.text
                if text is None:
                    continue
                for line in text.split("."):
                    for char in line:
                        if char in self.punctuations:
                            line = line.replace(char, '')
                    words = self.tokenizer.tokenize(line)
                    for word in words:
                        counter[word.lower()] +=1
        with io.open('D:\\Typing\\freq_words.txt', 'w', encoding='utf-8') as f:
            for w in counter.most_common(len(counter)):
                f.write(w[0]+u'\n')


    def vectors_to_string(self, vectors):
        s = ''
        if isinstance(vectors, tuple):
            for vector in vectors:
                for element in vector:
                    s += str(element) + ','
        else:
            for element in vectors:
                s += str(element) + ','
        return s[:len(s) - 1]
Example #29
import signal
from os import write  # assumption: low-level write to stdout (fd 1), matching write(1, ...) below

from nltk.tokenize import WordPunctTokenizer

from dasem.fullmonty import Word2Vec
from dasem.text import Decompounder

from cvrminer.cvrmongo import CvrMongo
from cvrminer.text import PurposeProcessor
from cvrminer.virksomhed import Virksomhed

# Ignore broken pipe errors
signal.signal(signal.SIGPIPE, signal.SIG_DFL)

decompounder = Decompounder()
purpose_processor = PurposeProcessor()
w2v = Word2Vec()
word_tokenizer = WordPunctTokenizer()

n = 1
cvr_mongo = CvrMongo()
for company in cvr_mongo.iter_companies():
    virksomhed = Virksomhed(company)
    purposes = virksomhed.formaal
    for purpose in purposes:
        cleaned_purpose = purpose_processor.clean(purpose)
        words = word_tokenizer.tokenize(cleaned_purpose)
        for word in words:
            word = word.lower()
            if word not in w2v.model:
                phrase = decompounder.decompound_word(word)
                for subphrase in phrase.split(' '):
                    if subphrase not in w2v.model:
                        write(1, subphrase.encode('utf-8') + b'\n')