Example #1
def clean_text(df_input,
               col='content',
               remove_unusual=False,
               remove_stopwords=False,
               toRemove=None,
               remove_numbers=False,
               stem_words=False,
               lemmatize=False,
               nGram=False):
    # Clean mails
    if toRemove is None:
        toRemove = []  # avoid sharing a mutable default list between calls
    if remove_stopwords:
        toRemove.extend(stopWords())
    usual_words = []
    if remove_unusual:
        usual_words = usualWords()
    ## Clean content of mails
    # tokenization and lemmatization/stemming
    df_input[col] = df_input[col].map(
        lambda x: text_to_words(x, remove_numbers, stem_words, lemmatize))
    # removing stopwords and unusual words
    df_input[col] = df_input[col].map(lambda x: remove_words(
        x, remove_unusual, remove_stopwords, toRemove, usual_words))
    # bigrams and trigrams
    if nGram:
        phrase = phrases.Phrases(df_input[col], min_count=30, threshold=300)
        bigram = phrases.Phraser(phrase)
        trigram = phrases.Phrases(bigram[df_input[col]])
        df_input[col] = [trigram[bigram[sent]] for sent in df_input[col]]

    print("data cleaned")
    return df_input
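A minimal usage sketch for `clean_text`, assuming a pandas DataFrame with raw text in its `content` column and that the helpers referenced above (`stopWords`, `usualWords`, `text_to_words`, `remove_words`) are defined in the same module; the call itself is hypothetical:

import pandas as pd

# Toy DataFrame; with a large corpus, frequent pairs such as
# ('new', 'york') may come back merged as 'new_york' when nGram=True.
df = pd.DataFrame({'content': [
    "New York is crowded in the summer",
    "I visited New York last summer",
]})
df_clean = clean_text(df, col='content',
                      remove_stopwords=True,
                      lemmatize=True,
                      nGram=True)
print(df_clean['content'].tolist())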
Example #2
def main(args):
    """Reads csv data file containing sentences, tokenizes and uses them to train word2vec model"""
    data = pd.read_csv(args['data_csv'], index_col=0)

    # tokenize and preprocess sentences
    sentences = [
        stripword(row.translate(translator).lower()).split(' ')
        for row in data['Sentence']
    ]

    # create bigrams to capture word combinations (e.g. New_York)
    bigram_transformer = phrases.Phrases(sentences)
    bigram = phrases.Phraser(bigram_transformer)

    # train word2vec model according to the hyperparameters chosen
    currentmodel = Word2Vec(bigram[sentences],
                            workers=-1,
                            sg=0,
                            size=args['model_size'],
                            min_count=5,
                            window=args['window_size'],
                            sample=1e-3)
    currentmodel.init_sims(replace=True)
    currentmodel.save("app/word2vec/word2vec_retrained")
    print('Saved as app/word2vec/word2vec_retrained')
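A sketch of driving `main`, assuming the module also defines `stripword` and a `translator` table (e.g. built with `str.maketrans`) and that the CSV has a `Sentence` column; the path and hyperparameter values below are placeholders:

args = {
    'data_csv': 'data/sentences.csv',  # CSV with a 'Sentence' column
    'model_size': 100,                 # Word2Vec vector dimensionality
    'window_size': 5,                  # context window for Word2Vec
}
main(args)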
Example #3
def make_bigrams(data_words, texts):
    
    from gensim.models import Phrases
    from gensim.models import phrases
    
    bigram = Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = phrases.Phraser(bigram)
    
    return [bigram_mod[doc] for doc in texts]
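A toy invocation of `make_bigrams`, passing the same tokenized corpus as both the phrase-training stream and the texts to transform:

# Tiny tokenized corpus; with realistic amounts of text, frequent pairs
# such as ('new', 'york') would be merged into 'new_york'.
data_words = [
    ['new', 'york', 'is', 'big'],
    ['i', 'love', 'new', 'york'],
    ['machine', 'learning', 'in', 'new', 'york'],
]
print(make_bigrams(data_words, data_words))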
Example #4
def train_phrase():
    sentence_stream = list()
    for doc in documentList:
        wordlist = doc.split(" ")
        sentence_stream.append(wordlist)

    ps = phrase.Phrases(sentence_stream)
    bigram = phrase.Phraser(ps)
    return bigram
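The returned `Phraser` can then be applied to any tokenized sentence. A sketch, assuming `documentList` is a module-level list of raw document strings and `phrase` is an alias for `gensim.models.phrases`:

documentList = [
    "new york is a big city",
    "she moved to new york last year",
    "new york has great food",
]
bigram = train_phrase()
# frequent pairs may come back joined, e.g. ['new_york', 'city']
print(bigram["new york city".split(" ")])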
Example #5
def phrase_detection(df):
    """ Given the emails dataframe, form bigrams based on the text in "Body" field """
    sentences = [text.split() for text in df["Body"]]
    phrases_ = phrases.Phrases(sentences,
                               min_count=params.bigrams_min_count,
                               threshold=params.bigrams_threshold)
    bigram = phrases.Phraser(phrases_)
    # for phr, score in phrases_.export_phrases(sentences):
    #     print(u'{0}   {1}'.format(phr, score))
    return bigram
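A hedged usage sketch for `phrase_detection`, assuming `gensim.models.phrases` is imported as `phrases`; `params` is normally the project's config module, so a stand-in namespace and toy email bodies are used here:

import pandas as pd
from types import SimpleNamespace

params = SimpleNamespace(bigrams_min_count=1, bigrams_threshold=1)  # stand-in config
df = pd.DataFrame({'Body': [
    "please review the attached quarterly report",
    "the quarterly report is attached for your review",
]})
bigram = phrase_detection(df)
print(bigram["see the quarterly report".split()])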
Example #6
def make_bigram(dirpaths):
    sentences = corpora(dirpaths, loop_or_not=False)
    print('Start phrasing:')
    phrase = phrases.Phrases(sentences,
                             max_vocab_size=DICTLENGTH,
                             min_count=1,
                             threshold=5,
                             common_terms={'of', 'and', 'the', 'with'})
    bigram = phrases.Phraser(phrase)
    bigram.save(SAVED_BIGRAM_PATH)
    print('Bigram phraser saved.')
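Once saved, the phraser can be reloaded later without re-scanning the corpus; a sketch, assuming `SAVED_BIGRAM_PATH` points at the file written above (gensim 3.x API):

from gensim.models import phrases

bigram = phrases.Phraser.load(SAVED_BIGRAM_PATH)
# apply the reloaded phraser to a new tokenized sentence
print(bigram['the city of new york'.split()])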
Example #7
  def __init__(self, model_path, create=False, corpus=None, bigrams=True):
    """
    Initializes the rewriter, given a particular Word2Vec corpus.
    A good example corpus is the Wikipedia Text8Corpus.
    You only need the corpus if you are recreating the model from scratch.

    If ``create == True``, this generates a new Word2Vec
    model (which takes a really long time to build). If ``False``, this loads
    an existing model we already saved.

    :param str model_path: where to store the model files. This file
        needn't exist, but its parent folder should.
    :param bool create: True to create a new Word2Vec model, False to
        use the one stored at ``model_path``.
    :param Iterable corpus: only needed if ``create=True``. Defines a corpus
        for Word2Vec to learn from.
    :param bool bigrams: only needed if ``create=True``. If True, takes some
        more time to build a model that supports bigrams (e.g. `new_york`).
        Otherwise, it'll only support one-word searches. ``bigrams=True`` makes
        this slower but more complete.
    """

    self.model_path = model_path

    # TODO: add logic around defaulting to creating or not

    if create:
      # generate a new Word2Vec model... takes a while!
      # TODO optimize parameters

      transformed_corpus = None
      if bigrams:
        # TODO save the phraser somewhere... but that requires
        # even more arguments.
        # the Phrases class lets you generate bigrams, but the
        # Phraser class is a more compact version of the same
        # TODO making the phrases takes forever, making the phraser
        # takes forever, turning it into a list takes forever... this
        # is really annoying. is there any way to speed it up?
        bigram_generator = phrases.Phraser(phrases.Phrases(corpus))
        # weird bug where the bigram generator won't work unless
        # it's turned into a list first. if you try to do it straight,
        # it'll give you total gibberish. FIXME
        bigram_corpus = list(bigram_generator[corpus])
        transformed_corpus = bigram_corpus
      else:
        # no bigrams, same old corpus
        transformed_corpus = corpus

      self.model = word2vec.Word2Vec(transformed_corpus, workers=8)
      self.model.save(self.model_path)
    else:
      self.model = word2vec.Word2Vec.load(self.model_path)
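A usage sketch for the constructor above, assuming the enclosing class is named `Rewriter` (the name is not shown in the snippet) and using gensim's bundled `Text8Corpus` reader, as the docstring suggests; 'text8' is a placeholder path:

from gensim.models import word2vec

corpus = word2vec.Text8Corpus('text8')        # path to the Text8 dump
# build once (slow), then reload cheaply on later runs
rewriter = Rewriter('models/text8_w2v', create=True, corpus=corpus, bigrams=True)
reloaded = Rewriter('models/text8_w2v', create=False)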
Example #8
def word_modeling(tokens):
    from gensim.corpora import Dictionary
    from gensim.models import phrases, LdaModel
    import pprint

    bigram = phrases.Phraser(phrases.Phrases(tokens, min_count=2))
    for i, ts in enumerate(tokens):
        for btoken in bigram[ts]:
            if '_' in btoken and btoken not in tokens[i]:
                tokens[i].append(btoken)

    token_dict = Dictionary(tokens)
    corpus = [token_dict.doc2bow(t) for t in tokens]

    _ = token_dict[0]
    model = LdaModel(corpus=corpus, id2word=token_dict.id2token, chunksize=len(tokens), alpha="auto",
                     eta="auto", iterations=400, num_topics=20, passes=20, eval_every=None)
    pprint.pprint(model.top_topics(corpus))
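A sketch of calling `word_modeling` on a tiny tokenized corpus; in practice the LDA settings above (20 topics, 20 passes) expect a much larger document set:

docs = [
    ['deep', 'learning', 'neural', 'network'],
    ['neural', 'network', 'training', 'data'],
    ['deep', 'learning', 'training', 'data'],
]
word_modeling(docs)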
Example #9
    def addSentence(self, sentence):
        try:
            #f=open("w2v_"+self.name,"r")
            model = gensim.models.KeyedVectors.load_word2vec_format("w2v_" +
                                                                    self.name)
            weights = model.syn0
        except FileNotFoundError:
            print(len(sentence))
            ph = phrases.Phrases(sentence)
            bigram_transformer = phrases.Phraser(ph)
            trigram = phrases.Phrases(bigram_transformer[sentence])
            ngram = phrases.Phrases(trigram[sentence])
            #ngram=phrases.Phrases(trigram[bigram_transformer[sentence]])
            model = Word2Vec(ngram[trigram[bigram_transformer[sentence]]],
                             size=40000,
                             window=5,
                             min_count=1,
                             workers=4,
                             sg=0,
                             iter=80)
            model.wv.save_word2vec_format("w2v_" + self.name)
            #print(sentence[1:10])
            #print("Fresh :",model["fresh"])
            #print("ताजा :",model["ताजा"])
            weights = model.wv.syn0
        #print(weights)
        np.save(open("embed" + self.name + ".txt", 'wb'), weights)

        vocab = dict([(k, v.index) for k, v in model.vocab.items()])
        with open("vocab" + self.name + ".txt", 'w', encoding='utf-8') as f:
            f.write(json.dumps(vocab))
        with open("vocab" + self.name + ".txt", 'r', encoding='utf-8') as f:
            data = json.loads(f.read())
        self.word2index = data
        self.index2word = dict([(v, k) for k, v in data.items()])
        self.n_words = len(model.vocab)

        print(self.name + ":", self.n_words)
Example #10
 def train(self, data_iterator, **kwargs):
     # Train the phraser from gensim
     self.phraser = gensim_phrases.Phraser(
         gensim_phrases.Phrases(data_iterator, **kwargs))
Example #11
    def save(self, kind, bigrams=True):

        print('Initializing split word phraser')
        stream = self.stream('sentences', 'list')
        split_word_model = phrases.Phrases(stream)
        # first, reunite words that shouldn't be split;
        # remove all bigrams that don't merge into a real word
        split_word_phraser = phrases.Phraser(split_word_model)
        for word_tuple in list(split_word_phraser.phrasegrams.keys()):
            if not word_tuple[0] + word_tuple[1] in nlp.vocab:
                del split_word_phraser.phrasegrams[word_tuple]
        # we don't want the merged words to have a delimiter in them
        split_word_phraser.delimiter = b''

        if bigrams is True:
            print('Initializing bigram phraser')
            # now we actually look for bigrams
            stream = self.stream('sentences', 'list')
            bigram_model = phrases.Phrases(split_word_phraser[stream])

            # this phraser will catch bigrams that are very unique but less common
            bigram_model.min_count = 20
            bigram_model.threshold = 90
            bigram_phraser_threshold = phrases.Phraser(bigram_model)

            # this one will catch bigrams that are less unique but very common
            bigram_model.min_count = 70
            bigram_model.threshold = 60
            bigram_phraser_count = phrases.Phraser(bigram_model)

        if kind == 'documents':
            save_path = self.save_dir.joinpath('line_documents.txt')
        elif kind == 'sentences':
            sp.call(['rm -rf {}/line_sentences'.format(self.save_dir.name)],
                    shell=True)
            save_dir = self.save_dir.joinpath('line_sentences')
            save_dir.mkdir(exist_ok=True)

        for i, tokenized_text in enumerate(self.stream('documents', 'spacy')):
            print('Writing {} in line-{} format'.format(
                self.raw_paths[i].name, kind))

            if kind == 'sentences':
                save_path = save_dir.joinpath(self.raw_paths[i].name + '.txt')
            if kind == 'documents':
                document_tokens = []
            with save_path.open('a') as save_file:
                for sentence in tokenized_text.sents:
                    sentence_tokens = []
                    for token in sentence:
                        if token.pos_ in ['PROPN', 'NUM']:
                            sentence_tokens.append(token.pos_)
                        elif token.is_alpha and token.is_ascii and not token.is_oov:
                            sentence_tokens.append(token.text)

                    sentence_tokens = split_word_phraser[sentence_tokens]
                    if bigrams is True:
                        sentence_tokens = bigram_phraser_threshold[
                            sentence_tokens]
                        sentence_tokens = bigram_phraser_count[sentence_tokens]

                    if kind == 'sentences':
                        sentence_string = ' '.join(sentence_tokens)
                        if len(sentence_string) > 0:
                            save_file.write(sentence_string + '\n')

                    if kind == 'documents':
                        document_tokens += sentence_tokens
                if kind == 'documents':
                    document_string = ' '.join(document_tokens)
                    save_file.write(document_string + '\n')
Example #12
    sentences_copy = sentences

    threshold = 8.0

    print("Beginning multi-gram accumulation.")

    # I want to keep making larger and larger n-grams until I think
    # there are no more to be made.
    grams = []      # Phraser objects accumulated across passes, applied in order
    unigrams = []   # phrases found in the previous pass
    while True:
        bigram = Phrases(sentences_copy, threshold=threshold)
        bigrams = bigram.export_phrases(sentences_copy)
        z = list(set(bigrams) - set(unigrams))
        if len(z) == 0 or threshold > 12:
            break
        else:
            gram_bigram = gmp.Phraser(bigram)
            sentences_copy = gram_bigram[sentences_copy]
            unigrams = bigrams
            grams.append(gram_bigram)
            threshold += 1

    # Maybe there's a more elegant solution to this, but this alters
    # the Keras code in a minimal way.
    def gram_er(sentences):
        temp = sentences
        for g in grams:
            temp = g[temp]
        return temp

    num_words = len(set([i for k in sentences for i in k]))
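Once the loop has accumulated its phrasers, `gram_er` can be applied to fresh tokenized sentences before they reach the Keras pipeline; a sketch, assuming the surrounding function has run to this point:

new_sentences = [['new', 'york', 'stock', 'exchange', 'opened', 'higher']]
# the chained phrasers may join frequent pairs, e.g. 'new_york'
print(list(gram_er(new_sentences)))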