Example #1
def phraseBuilder(df_sentences):
    # creating bigram Gensim Phrases:
    bigram = phrases.Phrases(delimiter='_')

    # convert df to list of sentences:
    sentences = []
    for index, row in df_sentences.iterrows():
        print(row['Feedback'])
        trans = row['Feedback']
        trans = trans.encode('ascii', 'ignore').decode('ascii')  # drop non-ASCII but keep a str for tokenization
        sentence = [
            word for word in word_tokenize(cleanTranscript(trans))
            if word.isalpha()
        ]
        sentences.append(sentence)
        bigram.add_vocab([sentence])

    finalTrans = []

    # creating trigram Gensim Phrases
    trigram = phrases.Phrases(bigram[sentences], delimiter='_')
    for sen in trigram[bigram[sentences]]:
        trigramSen = ' '.join(w for w in sen)
        finalTrans.append(trigramSen)

    # assign interaction_id to these trigram Phrases that we will chunk to get useful context
    finalTransSeries = pd.Series(finalTrans)
    df_sentences = df_sentences.drop(columns='Feedback')
    df_sentences['finalTrans'] = finalTransSeries.values
    return df_sentences
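A minimal usage sketch for phraseBuilder, assuming pandas is imported as pd and that cleanTranscript and word_tokenize are available as in the snippet above; the sample DataFrame values are illustrative only:

# Illustrative call; the 'Feedback' column name matches the function above.
df_feedback = pd.DataFrame({
    'interaction_id': [101, 102],
    'Feedback': ['the customer service agent was very helpful',
                 'long wait time before reaching customer service'],
})
df_phrased = phraseBuilder(df_feedback)
print(df_phrased[['interaction_id', 'finalTrans']])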
Example #2
def clean_text(df_input,
               col='content',
               remove_unusual=False,
               remove_stopwords=False,
               toRemove=None,
               remove_numbers=False,
               stem_words=False,
               lemmatize=False,
               nGram=False):
    # Clean mails
    toRemove = list(toRemove) if toRemove else []  # avoid mutating a shared default list
    if remove_stopwords:
        toRemove.extend(stopWords())
    usual_words = []
    if remove_unusual:
        usual_words = usualWords()
    ## Clean content of mails
    # tokenization and lemmatization/stemming
    df_input[col] = df_input[col].map(
        lambda x: text_to_words(x, remove_numbers, stem_words, lemmatize))
    # removing stopwords and unusual words
    df_input[col] = df_input[col].map(lambda x: remove_words(
        x, remove_unusual, remove_stopwords, toRemove, usual_words))
    # bigrams and trigrams
    if nGram:
        phrase = phrases.Phrases(df_input[col], min_count=30, threshold=300)
        bigram = phrases.Phraser(phrase)
        trigram = phrases.Phrases(bigram[df_input[col]])
        df_input[col] = [trigram[bigram[sent]] for sent in df_input[col]]

    print("data cleaned")
    return df_input
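A possible call, sketched under the assumption that pandas is imported as pd and that the helper functions referenced above (stopWords, usualWords, text_to_words, remove_words) are defined elsewhere in the same module:

# Sketch only; relies on the helpers referenced inside clean_text.
df_mails = pd.DataFrame({'content': ['Please find the quarterly report attached',
                                     'Meeting moved to 3 pm tomorrow']})
df_mails = clean_text(df_mails,
                      col='content',
                      remove_stopwords=True,
                      remove_numbers=True,
                      lemmatize=True,
                      nGram=False)  # nGram=True only pays off on a large corpus (min_count=30)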
Example #3
def phraseBuilder(df_sentences):

    #creating bigram Gensim Phrases:
    bigram = phrases.Phrases(delimiter='-')

    #convert df to list of sentences:
    sentences = []
    for index, row in df_sentences.iterrows():
        sentence = [word for word in word_tokenize(row['phrase'])]
        sentences.append(sentence)
        bigram.add_vocab([sentence])

    finalTrans = []

    # creating trigram Gensim Phrases
    trigram = phrases.Phrases(bigram[sentences], delimiter='-')
    for sen in trigram[bigram[sentences]]:
        trigramSen = ' '.join(w for w in sen)
        finalTrans.append(trigramSen)

    #assign interaction_id to these trigram Phrases that we will chunk to get useful context
    finalTransSeries = pd.Series(finalTrans)
    df_sentences = df_sentences.drop(columns='phrase')
    df_sentences['finalTrans'] = finalTransSeries.values
    return df_sentences
Example #4
 def __init__(self, corpus_dir=None):
     super().__init__(corpus_dir=corpus_dir)
     self.sentences = self.get_sentences()
     model_save_path = os.path.join(
         self.save_dir, "%s_model.pkl" % self.__class__.__name__)
     try:
         with open(model_save_path, 'rb') as f:
             self._bigrams, self._trigrams = pickle.load(f)
     except FileNotFoundError:
         self._bigrams = gen_phrases.Phrases(self.sentences)
         self._trigrams = gen_phrases.Phrases(self._bigrams[self.sentences])
         with open(model_save_path, 'wb') as f:
             pickle.dump((self._bigrams, self._trigrams), f)
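A small helper method (purely illustrative, not part of the original class) could then chain the cached bigram and trigram models over the sentences:

 # Illustrative helper (not in the original class): chain the cached models
 def phrased_sentences(self):
     # apply the cached bigram model, then the trigram model, to each sentence
     for sentence in self.sentences:
         yield self._trigrams[self._bigrams[sentence]]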
Example #5
def main(args):
    """Reads csv data file containing sentences, tokenizes and uses them to train word2vec model"""
    data = pd.read_csv(args['data_csv'], index_col=0)

    # tokenize and preprocess sentences
    sentences = [
        stripword(row.translate(translator).lower()).split(' ')
        for row in data['Sentence']
    ]

    # create bigrams to capture word combinations (e.g. New_York)
    bigram_transformer = phrases.Phrases(sentences)
    bigram = phrases.Phraser(bigram_transformer)

    # train word2vec model according to the hyperparameters chosen
    currentmodel = Word2Vec(bigram[sentences],
                            workers=4,  # use a positive worker count (workers=-1 is not supported by gensim)
                            sg=0,
                            size=args['model_size'],
                            min_count=5,
                            window=args['window_size'],
                            sample=1e-3)
    currentmodel.init_sims(replace=True)
    currentmodel.save("app/word2vec/word2vec_retrained")
    print('Saved as app/word2vec/word2vec_retrained')
Example #6
def get_collocations(text, verbose = True, bigram_freq = True):
    if (verbose):
        print('Word Tokenization...')
    tokens = [t.split() for t in text]
    
    if (verbose):
        print('Making Bigramer Model...')
        
    bigramer = phrases.Phrases(tokens)  # train model with default settings
    
    
    if (bigram_freq):   
        if (verbose):
            print('Making Bigramer list...')
        
        bigram_counter = list()
        bigram_list = list(bigramer.vocab.items())
        for key, value in bigram_list:
            str_key = key.decode()
            if len(str_key.split("_")) > 1:
                bigram_counter.append(tuple([str_key, value]))
        bigram_df = pd.DataFrame(bigram_counter, columns=['bigrams', 'count'])

    
    if (bigram_freq):
        res_dict = {'bigramer': bigramer, 'bigram_freq': bigram_df}
    else:
        res_dict = {'bigramer': bigramer, 'bigram_freq': None}
    
    return res_dict
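A short usage sketch, using the pre-4.0 gensim API this snippet assumes (vocab keys as bytes); the toy corpus is illustrative, and with gensim's default min_count/threshold real collocations only emerge on a reasonably large corpus:

# Usage sketch for get_collocations (toy input; defaults need a larger corpus to fire)
docs = ["he moved to new york last year", "new york is a big city"]
result = get_collocations(docs, verbose=False)
print(result['bigram_freq'].sort_values('count', ascending=False).head())
print(result['bigramer'][["i", "love", "new", "york"]])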
Example #7
    def ngram_counts(self, clean_sentences):
        ''' threshold applies to the phrase (PMI-style) score and min_count to raw word counts;
            min_count=1 is used so that persons who are mentioned only rarely are not missed '''
        phrase = phrases.Phrases(clean_sentences,
                                 min_count=1,
                                 threshold=2,
                                 delimiter=' ')
        #len(phrase.vocab.keys())
        bigrams = phrases.Phrases(phrase[clean_sentences],
                                  min_count=1,
                                  threshold=2,
                                  delimiter=' ')
        #len(bigrams.vocab.keys())
        trigrams = phrases.Phrases(bigrams[clean_sentences],
                                   min_count=1,
                                   threshold=2,
                                   delimiter=' ')
        #len(trigrams.vocab.keys())

        #unigram_count_dict={}
        #bigram_count_dict={}
        #trigram_count_dict={}
        #ngram_count_dict=trigrams.vocab
        """
        i=0
        for k in trigrams.vocab.keys():
            if i%100000==0:
                print i,"done"
                

            if len(k.split("_"))>=4:
                quadgram_count_dict[k]=trigrams.vocab[k]
            elif len(k.split("_"))==3:
                trigram_count_dict[k]=trigrams.vocab[k]
            elif len(k.split("_"))==2:
                bigram_count_dict[k]=trigrams.vocab[k]
            else:
                unigram_count_dict[k]=trigrams.vocab[k]

            i=i+1 
        
        """

        os.chdir(self.inter_data_path)
        pickle.dump(trigrams.vocab, open("ngram_count_dict.pkl", "wb"))
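The pickled vocabulary could later be reloaded and bucketed by n-gram length, a Python 3 sketch of the commented block above; note that with delimiter=' ' the keys are joined by spaces rather than underscores, and their type (str or bytes) depends on the gensim version:

# Sketch: reload the pickled counts and split them by n-gram length
import pickle

with open("ngram_count_dict.pkl", "rb") as f:
    ngram_counts = pickle.load(f)

unigram_counts = {k: v for k, v in ngram_counts.items() if len(k.split()) == 1}
bigram_counts = {k: v for k, v in ngram_counts.items() if len(k.split()) == 2}
trigram_counts = {k: v for k, v in ngram_counts.items() if len(k.split()) >= 3}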
Example #8
def train_phrase():
    sentence_stream = list()
    for doc in documentList:
        wordlist = doc.split(" ")
        sentence_stream.append(wordlist)

    ps = phrase.Phrases(sentence_stream)
    bigram = phrase.Phraser(ps)
    return bigram
Example #9
def phrase_detection(df):
    """ Given the emails dataframe, form bigrams based on the text in "Body" field """
    sentences = [text.split() for text in df["Body"]]
    phrases_ = phrases.Phrases(sentences,
                               min_count=params.bigrams_min_count,
                               threshold=params.bigrams_threshold)
    bigram = phrases.Phraser(phrases_)
    # for phr, score in phrases_.export_phrases(sentences):
    #     print(u'{0}   {1}'.format(phr, score))
    return bigram
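Applying the returned Phraser back to the e-mail bodies might look like this sketch (the new column name is illustrative, not from the original code):

# Usage sketch: rewrite each body with detected bigrams joined by '_'
bigram = phrase_detection(df)
df["Body_phrased"] = [" ".join(bigram[text.split()]) for text in df["Body"]]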
Example #10
def make_bigram(dirpaths):
    sentences = corpora(dirpaths, loop_or_not=False)
    print('Start phrasing:')
    phrase = phrases.Phrases(sentences,
                             max_vocab_size=DICTLENGTH,
                             min_count=1,
                             threshold=5,
                             common_terms={'of', 'and', 'the', 'with'})
    bigram = phrases.Phraser(phrase)
    bigram.save(SAVED_BIGRAM_PATH)
    print('bigram phraser saved conclude.')
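The saved phraser can later be reloaded and applied to new token lists; a minimal sketch, assuming the same SAVED_BIGRAM_PATH:

# Reload the persisted Phraser and apply it to a tokenized sentence
bigram = phrases.Phraser.load(SAVED_BIGRAM_PATH)
print(bigram[['deep', 'learning', 'of', 'the', 'neural', 'network']])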
Example #11
def train():
    choo_choo_train = word2vec.LineSentence(INPUT_FILE)
    bigram = phrases.Phrases(
        sentences=choo_choo_train,
        min_count=50,
        threshold=10.0,
    )
    trigram = phrases.Phrases(
        sentences=bigram[choo_choo_train],
        min_count=50,
        threshold=10.0,
    )
    model = word2vec.Word2Vec(
        sentences=trigram[choo_choo_train],
        min_count=100,
        size=100,
        workers=4,
    )
    model.init_sims(replace=True)
    model.save(OUTPUT_FILE)
Example #12
  def __init__(self, model_path, create=False, corpus=None, bigrams=True):
    """
    Initializes the rewriter, given a particular Word2Vec corpus.
    A good example corpus is the Wikipedia Text8Corpus.
    You only need the corpus if you are recreating the model from scratch.

    If ``create == True``, this generates a new Word2Vec
    model (which takes a really long time to build.) If ``False``, this loads
    an existing model we already saved.

    :param str model_path: where to store the model files. This file
        needn't exist, but its parent folder should.
    :param bool create: True to create a new Word2Vec model, False to
        use the one stored at ``model_path``.
    :param Iterable corpus: only needed if ``create=True``. Defines a corpus
        for Word2Vec to learn from.
    :param bool bigrams: only needed if ``create=True``. If True, takes some
        more time to build a model that supports bigrams (e.g. `new_york`).
        Otherwise, it'll only support one-word searches. ``bigram=True`` makes
        this slower but more complete.
    """

    self.model_path = model_path

    # TODO: add logic around defaulting to creating or not

    if create:
      # generate a new Word2Vec model... takes a while!
      # TODO optimize parameters

      transformed_corpus = None
      if bigrams:
        # TODO save the phraser somewhere... but that requires
        # even more arguments.
        # the Phrases class lets you generate bigrams, but the
        # Phraser class is a more compact version of the same
        # TODO making the phrases takes forever, making the phraser
        # takes forever, turning it into a list takes forever... this
        # is really annoying. is there any way to speed it up?
        bigram_generator = phrases.Phraser(phrases.Phrases(corpus))
        # weird bug where the bigram generator won't work unless
        # it's turned into a list first. if you try to do it straight,
        # it'll give you total gibberish. FIXME
        bigram_corpus = list(bigram_generator[corpus])
        transformed_corpus = bigram_corpus
      else:
        # no bigrams, same old corpus
        transformed_corpus = corpus

      self.model = word2vec.Word2Vec(transformed_corpus, workers=8)
      self.model.save(self.model_path)
    else:
      self.model = word2vec.Word2Vec.load(self.model_path)
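Instantiating the rewriter could look like the following sketch; the class name Rewriter and the file paths are placeholders, not taken from the original code:

# Hypothetical usage; 'Rewriter' and the paths are illustrative only.
from gensim.models import word2vec

corpus = word2vec.Text8Corpus('/tmp/text8')           # e.g. the Wikipedia Text8Corpus
rw = Rewriter('/tmp/text8.model', create=True,
              corpus=corpus, bigrams=True)            # slow: trains a new model
rw = Rewriter('/tmp/text8.model')                     # later runs: just load it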
Example #13
    def addSentence(self, sentence):
        try:
            #f=open("w2v_"+self.name,"r")
            model = gensim.models.KeyedVectors.load_word2vec_format("w2v_" +
                                                                    self.name)
            weights = model.syn0
        except FileNotFoundError:
            print(len(sentence))
            ph = phrases.Phrases(sentence)
            bigram_transformer = phrases.Phraser(ph)
            trigram = phrases.Phrases(bigram_transformer[sentence])
            ngram = phrases.Phrases(trigram[sentence])
            #ngram=phrases.Phrases(trigram[bigram_transformer[sentence]])
            model = Word2Vec(ngram[trigram[bigram_transformer[sentence]]],
                             size=40000,
                             window=5,
                             min_count=1,
                             workers=4,
                             sg=0,
                             iter=80)
            model.wv.save_word2vec_format("w2v_" + self.name)
            #print(sentence[1:10])
            #print("Fresh :",model["fresh"])
            #print("ताजा :",model["ताजा"])
            weights = model.wv.syn0
        #print(weights)
        np.save(open("embed" + self.name + ".txt", 'wb'), weights)

        vocab = dict([(k, v.index) for k, v in model.vocab.items()])
        with open("vocab" + self.name + ".txt", 'w', encoding='utf-8') as f:
            f.write(json.dumps(vocab))
        with open("vocab" + self.name + ".txt", 'r', encoding='utf-8') as f:
            data = json.loads(f.read())
        self.word2index = data
        self.index2word = dict([(v, k) for k, v in data.items()])
        self.n_words = len(model.vocab)

        print(self.name + ":", self.n_words)
Example #14
def build_phrases(doc_list):
    # creating bigram Gensim Phrases:
    bigram = phrases.Phrases(delimiter='-')
    bigram_phrases = []
    for doc in doc_list:
        for sen in doc:
            sen = sen.replace('\n', '')
            sentence = [word for word in sen.split()]
            #print(sentence)
            bigram_phrases.append(sentence)
            # add_vocab expects a list of token lists, not a string of characters
            bigram.add_vocab([sentence])

    trigram_phrases = []

    # creating trigram Gensim Phrases
    trigram = phrases.Phrases(bigram[bigram_phrases], delimiter='-')
    for sen in trigram[bigram[bigram_phrases]]:
        trigramSen = ' '.join(w for w in sen)
        trigram_phrases.append(trigramSen)

    return trigram_phrases
Example #15
def build_model(corpus_path, detect_phrase=False) :
    startTime = time.time()

    sentences = word2vec.LineSentence(corpus_path)
    if detect_phrase :
        bigram_transformer = phrases.Phrases(sentences)
        model = word2vec.Word2Vec(bigram_transformer[sentences], size=100, alpha=0.025, window=5, min_count=5, sample=1e-5, workers=4, sg=1)
    else :
        model = word2vec.Word2Vec(sentences, size=100, alpha=0.025, window=5, min_count=5, sample=1e-5, workers=4, sg=1)
    # no more training
    model.init_sims(replace=True)
    durationTime = time.time() - startTime
    sys.stderr.write("duration time = %f\n" % durationTime)
    return model
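A brief usage sketch, following the pre-4.0 gensim API used above; the corpus path and query word are illustrative:

# Usage sketch: build the model from a one-sentence-per-line corpus and query it
model = build_model('corpus.txt', detect_phrase=True)
print(model.wv.most_similar('king', topn=5))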
Example #16
def run_on_all_books(books, bootstrap=True):
    """Runs word2vec training on data.

    Args:
        books: dictionary of titles to text (str)
        bootstrap: whether to bootstrap sample from the sentences

    """

    # Combine all text into a list of sentences
    print("Getting sentences...")
    all_sentences = []
    for title, book in books.items():
        all_sentences.extend(get_sentences(book))

    # Create model
    bigrams = phrases.Phrases(all_sentences,
                              min_count=5,
                              delimiter=b' ',
                              common_terms=stopwords)

    # Create vocabulary of bigrams
    print("Creating vocabulary...")
    vocab = [w for sent in bigrams[all_sentences] for w in sent]
    vocab = [w for w, count in Counter(vocab).most_common() if count >= 5]

    # Save vocab
    with codecs.open(os.path.join(args.output_dir, 'vocab.txt'),
                     'w',
                     encoding='utf-8') as f:
        f.write('\n'.join(vocab))

    # Run word2vec model
    for run_idx in range(args.num_runs):
        print("Run #%d" % run_idx)
        if bootstrap:
            # np.random.choice needs 1-D input, so sample indices instead of sentences
            idx = np.random.choice(len(all_sentences),
                                   len(all_sentences),
                                   replace=True)
            data = bigrams[[all_sentences[i] for i in idx]]
        else:
            data = bigrams[all_sentences]
        model = word2vec.Word2Vec(data,
                                  size=args.dim,
                                  window=args.window,
                                  sg=1,
                                  min_count=5,
                                  workers=10)
        model.wv.save(os.path.join(args.output_dir, str(run_idx) + '.wv'))
Example #17
def create_phrases_model():
    MyUtils.init_logging("Encode_Common.log")
    logging.info("Starting preparation of phrases...")
    docs_percent_touse = 1  #0.5.
    chunk_size = 10**5

    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    all_docwords = []
    for doc_file in doc_filenames:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            indices = list(
                sorted(
                    numpy.random.choice(len_c,
                                        int(docs_percent_touse * len_c),
                                        replace=False)))
            selected_rows = docs_chunk.iloc[indices]
            for tupl in selected_rows.itertuples():
                word_ls = ast.literal_eval(tupl.words)
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Reading in the documents' words. Chunk processed...")
        logging.info("Completed: reading in a set of documents' words"
                     )  # @ time = " + str(round(time1 - start, 3)))

    logging.info("Number of documents to use in the Phrases model: %s",
                 str(len(all_docwords)))
    del doc_filenames
    collect()

    phrases_model = phrases.Phrases(sentences=all_docwords,
                                    min_count=20,
                                    threshold=300,
                                    delimiter=b'_',
                                    max_vocab_size=30 * 10**6)
    #phraser_model = phrases.Phraser(phrases_model)
    #time2 = time();
    logging.info(
        "Phrases model created")  #@ time = " + str(round(time2 - start, 3)))
    logging.info("Memory size in MBs = %s",
                 str(mem.asizeof(phrases_model) // 2**20))

    phrases_model.save(F.PHRASES_MODEL)

    return phrases_model
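The persisted model can later be reloaded and applied to tokenized documents; a minimal sketch, assuming the same F.PHRASES_MODEL path (the example tokens are illustrative):

# Reload the saved Phrases model and phrase a tokenized document
phrases_model = phrases.Phrases.load(F.PHRASES_MODEL)
print(phrases_model[['stainless', 'steel', 'water', 'bottle']])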
Example #18
def explore_phrase2vec(min_freq, phrases_threshold):
    MyUtils.init_logging("Explore_Phrase2Vec.log")
    words_lls = []
    doc_filenames = [F.DESCDOCS_RAW, F.QADOCS_RAW]
    all_docwords = []
    chunk_size = 10**5
    for doc_file in doc_filenames:
        for docs_chunk in pd.read_csv(doc_file, chunksize=chunk_size):
            len_c = len(docs_chunk)
            words_chunk = []
            #indices = list(sorted(numpy.random.choice(len_c, int(docs_percent_touse * len_c), replace=False)))
            #selected_rows = docs_chunk.iloc[indices]
            for tupl in docs_chunk.itertuples():
                #words = tupl.words.replace("'",'"')
                #logging.info(words)
                #word_ls = json.loads(words)#ast.literal_eval(tupl.words)
                word_ls = eval(tupl.words, {'__builtins__': {}})
                words_chunk.append(word_ls)
            all_docwords.extend(words_chunk)
            logging.info("Added chunk from file %s to documents list...",
                         doc_file)

    logging.info("Number of documents: %s", len(all_docwords))
    phrases_model = phrases.Phrases(sentences=all_docwords,
                                    min_count=min_freq,
                                    threshold=phrases_threshold,
                                    delimiter=b'_')
    #logging.info("***The Phrases model's frequency vocabulary: %s", str(phrases_model.vocab))
    phrases_vocab = phrases_model.vocab
    del phrases_model
    collect()
    sorted_vocabulary = sorted(list(phrases_vocab.items()),
                               key=lambda tpl: tpl[1],
                               reverse=True)
    phrases_sorted_vocabulary = list(
        filter(lambda tpl: '_' in str(tpl[0]), sorted_vocabulary))
    individual_words_sorted_vocabulary = list(
        filter(lambda tpl: not ('_' in str(tpl[0])), sorted_vocabulary))
    logging.info("***The vocabulary of phrases, ordered by frequency : %s ",
                 phrases_sorted_vocabulary)
    logging.info("***The vocabulary of words, ordered by frequency : %s ",
                 individual_words_sorted_vocabulary)
    #phrases_model.save("Exploration_phrasesModel_mincount"+ str(min_freq) + "_T"+str(phrases_threshold) + ".model")

    for i in range(len(words_lls) // 4):
        print(str(phrases_model[words_lls[i]]))
Example #19
def word_modeling(tokens):
    from gensim.corpora import Dictionary
    from gensim.models import phrases, LdaModel

    bigram = phrases.Phraser(phrases.Phrases(tokens, min_count=2))
    for i, ts in enumerate(tokens):
        for btoken in bigram[ts]:
            if '_' in btoken and btoken not in tokens[i]:
                tokens[i].append(btoken)

    token_dict = Dictionary(tokens)
    corpus = [token_dict.doc2bow(t) for t in tokens]

    _ = token_dict[0]
    model = LdaModel(corpus=corpus, id2word=token_dict.id2token, chunksize=len(tokens), alpha="auto",
                     eta="auto", iterations=400, num_topics=20, passes=20, eval_every=None)
    pprint.pprint(model.top_topics(corpus))
Example #20
def cleanDocs(posts):
	stop = set(stopwords.words('english'))
	exclude = set(string.punctuation) 
	lemma = WordNetLemmatizer()
	clean_docs = []
	bigram_docs = []

	for post in posts: 
	    stop_free = " ".join([i for i in post.lower().split() if i not in stop])
	    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
	    digit_free = [word for word in punc_free.split() if not word.isdigit() and len(word) > 2]
	    normalized = " ".join(lemma.lemmatize(word) for word in digit_free)
	    nouns = [word[0] for word in nltk.pos_tag(normalized.split()) if word[1][0] == 'N' or word[1][0] == 'VB'] 
	    clean_docs.append(nouns)

	bigram_transformer = phrases.Phrases(clean_docs)
	
	for doc in bigram_transformer[clean_docs]:
			bigram_docs.append(doc)

	return bigram_docs 
Example #21
def word2vecmodel_gensim(text):
    corpus = text.values.tolist()
    bigrams = phrases.Phrases(corpus)
    model = gensim.models.Word2Vec(bigrams[corpus], min_count=10, size=100)
    return model
Example #22
def main():
    # uncomment the file that we want to assign topic + sentiment to
    df_csv = pd.read_csv("dataset/phase1/non_replies.csv",
                         encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase1/replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase1/replied_to.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase2/non_replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase2/replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase2/replied_to.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase3/non_replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase3/replies.csv", encoding='ISO-8859-1')
    # df_csv = pd.read_csv("dataset/phase3/replied_to.csv", encoding='ISO-8859-1')
    df_csv.head()
    textList = df_csv.values.tolist()
    print(len(textList))
    text = ""
    #                                                   ------- Loading dataset files -------

    # Uncomment the block corresponding the phase that we want to assign topics and sentiment to only
    # Uncomment entire phase block if in training phase. If in assigning phase uncomment only the file we want to assign to

    # Phase 1

    # with open("dataset/phase1/non_replies.csv", encoding='ISO-8859-1') as csvfile:
    #     text= csvfile.read() # uncomment for either training or assigning phase
    # with open("dataset/phase1/replies.csv", encoding='ISO-8859-1') as csvfile:
    #     # text+= csvfile.read() # uncomment for training phase
    #      text = csvfile.read() # uncomment for assigning phase
    # with open("dataset/phase1/replied_to.csv", encoding='ISO-8859-1') as csvfile:
    #     # text+= csvfile.read() #uncomment for training phase
    #     text = csvfile.read() #uncomment for assigning phase

    # Phase 2

    # with open("dataset/phase2/non_replies.csv", encoding='ISO-8859-1') as csvfile:
    #     text= csvfile.read() # uncomment for either training or assigning phase
    # with open("dataset/phase2/replies.csv", encoding='ISO-8859-1') as csvfile:
    #     # text+= csvfile.read() # uncomment for training phase
    #      text = csvfile.read() # uncomment for assigning phase
    # with open("dataset/phase2/replied_to.csv", encoding='ISO-8859-1') as csvfile:
    #     # text+= csvfile.read() #uncomment for training phase
    #     text = csvfile.read() #uncomment for assigning phase

    # Phase 3

    # with open("dataset/phase3/non_replies.csv", encoding='ISO-8859-1') as csvfile:
    #     text= csvfile.read() # uncomment for either training or assigning phase
    # with open("dataset/phase3/replies.csv", encoding='ISO-8859-1') as csvfile:
    #     # text+= csvfile.read() # uncomment for training phase
    #      text = csvfile.read() # uncomment for assigning phase
    # with open("dataset/phase3/replied_to.csv", encoding='ISO-8859-1') as csvfile:
    #     # text+= csvfile.read() #uncomment for training phase
    #     text = csvfile.read() #uncomment for assigning phase

    #                                                   ------- Generating corpus -------
    nlp = spacy.load("en_core_web_sm")

    my_stop_words = [
        'https', 'co', 'from', 'text', 'subject', 're', 'edu', 'use', 'RT',
        'make', 'jerusalemembassy', 'jerusalem', 'Jerusalem', 'amp',
        'JerusalemEmbassy', 'usembassyjerusalem'
    ]
    for stopword in my_stop_words:
        lexeme = nlp.vocab[stopword]
        lexeme.is_stop = True
    nlp.max_length = 1547045
    doc = nlp(text)
    texts, article = [], []
    for w in doc:
        if (w.text != '\n' and not w.is_stop and not w.is_punct
                and not w.like_num and not w.like_url and w.is_ascii
                and not w.is_left_punct and not w.is_right_punct
                and w.lang_ == 'en' and w.is_alpha):
            article.append(w.lemma_)

        if w.text == '\n':
            texts.append(article)
            article = []

    texts = [x for x in texts if x != []]

    bigram = phrases.Phrases(texts)
    texts = [bigram[line] for line in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    #                                                   ------- create LDA model (if in training phase) -------

    mallet_path = os.path.join('C:\\', 'new-mallet', 'mallet-2.0.8', 'bin',
                               'mallet.bat')

    # ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=10, id2word=dictionary)

    # ldamallet.save("phaseOne_full_model")
    # ldamallet.save("phaseTwo_full_model")
    # ldamallet.save("phaseThree_full_model")

    #                                                   ------- Loading LDA model (if in assigning phase) -------

    ldamallet = gensim.models.wrappers.LdaMallet.load(
        "LDAmodels\\phaseOne_full_model")

    # ldamallet = gensim.models.wrappers.LdaMallet.load("LDAmodels\\phaseTwo_full_model")
    # ldamallet = gensim.models.wrappers.LdaMallet.load("LDAmodels\\phaseThree_full_model")

    #                                                   ------- Assigning topics to each text -------

    def format_topics_sentences(ldamodel, corpus, texts):
        # Init output
        sent_topics_df = pd.DataFrame()
        # Get main topic in each document
        for i, row in enumerate(ldamodel[corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            dom_topic = ""
            perc_contrib = ""
            keywords = ""
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0 or j == 1:  # => top 2 dominant topics
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    dom_topic += str(topic_num)
                    perc_contrib += str(round(prop_topic, 4))
                    keywords += topic_keywords
                else:
                    sent_topics_df = sent_topics_df.append(pd.Series(
                        [dom_topic, perc_contrib, keywords]),
                                                           ignore_index=True)
                    break
        sent_topics_df.columns = [
            'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'
        ]
        # Add original text to the end of the output
        contents = pd.Series(texts)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return (sent_topics_df)

    #                                                   ------- Assigning sentiment to each text -------
    def getSentiment():
        analyzer = SentimentIntensityAnalyzer()
        sentimentResults = []
        for text in textList:
            for tweet in text:
                tweet = str(tweet)
                vs = analyzer.polarity_scores(tweet)
                if (vs['compound'] > 0.1):
                    sentimentResults.append("positive")
                elif (vs['compound'] < -0.1):
                    sentimentResults.append("negative")
                else:
                    sentimentResults.append("neutral")
        return sentimentResults

    #                                                   ------- Calling topic and sentiment assignment functions -------
    df_topic_sents_keywords = pd.DataFrame()
    df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet,
                                                      corpus=corpus,
                                                      texts=textList)
    sentimentColumn = pd.Series(getSentiment())
    #                                                   ------- Adding sentiment column to full dataframe -------
    df_topic_sents_keywords = pd.concat(
        [df_topic_sents_keywords, sentimentColumn], axis=1)

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = [
        'Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords',
        'Text', 'Sentiment'
    ]
Example #23
    def save(self, kind, bigrams=True):

        print('Initializing split word phraser')
        stream = self.stream('sentences', 'list')
        split_word_model = phrases.Phrases(self.stream('sentences', 'list'))
        # first, reunite words that shouldn't be split;
        # remove all bigrams that don't merge into a real word
        split_word_phraser = phrases.Phraser(split_word_model)
        for word_tuple in list(split_word_phraser.phrasegrams.keys()):
            if not word_tuple[0] + word_tuple[1] in nlp.vocab:
                del split_word_phraser.phrasegrams[word_tuple]
        # we don't want the merged words to have a delimiter in them
        split_word_phraser.delimiter = b''

        if bigrams is True:
            print('Initializing bigram phraser')
            # now we actually look for bigrams
            stream = self.stream('sentences', 'list')
            bigram_model = phrases.Phrases(split_word_phraser[stream])

            # this phraser will catch bigrams that are very unique but less
            bigram_model.min_count = 20
            bigram_model.threshold = 90
            bigram_phraser_threshold = phrases.Phraser(bigram_model)

            # this one will catch bigrams that are less unique but very common
            bigram_model.min_count = 70
            bigram_model.threshold = 60
            bigram_phraser_count = phrases.Phraser(bigram_model)

        if kind == 'documents':
            save_path = self.save_dir.joinpath('line_documents.txt')
        elif kind == 'sentences':
            sp.call(['rm -rf {}/line_sentences'.format(self.save_dir.name)],
                    shell=True)
            save_dir = self.save_dir.joinpath('line_sentences')
            save_dir.mkdir(exist_ok=True)

        for i, tokenized_text in enumerate(self.stream('documents', 'spacy')):
            print('Writing {} in line-{} format'.format(
                self.raw_paths[i].name, kind))

            if kind == 'sentences':
                save_path = save_dir.joinpath(self.raw_paths[i].name + '.txt')
            if kind == 'documents':
                document_tokens = []
            with save_path.open('a') as save_file:
                for sentence in tokenized_text.sents:
                    sentence_tokens = []
                    for token in sentence:
                        if token.pos_ in ['PROPN', 'NUM']:
                            sentence_tokens.append(token.pos_)
                        elif token.is_alpha and token.is_ascii and not token.is_oov:
                            sentence_tokens.append(token.text)

                    sentence_tokens = split_word_phraser[sentence_tokens]
                    if bigrams is True:
                        sentence_tokens = bigram_phraser_threshold[
                            sentence_tokens]
                        sentence_tokens = bigram_phraser_count[sentence_tokens]

                    if kind == 'sentences':
                        sentence_string = ' '.join(sentence_tokens)
                        if len(sentence_string) > 0:
                            save_file.write(sentence_string + '\n')

                    if kind == 'documents':
                        document_tokens += sentence_tokens
                if kind == 'documents':
                    document_string = ' '.join(document_tokens)
                    save_file.write(document_string + '\n')
Example #24
 def train(self, data_iterator, **kwargs):
     # Train the phraser from gensim
     self.phraser = gensim_phrases.Phraser(
         gensim_phrases.Phrases(data_iterator, **kwargs))
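A companion method (illustrative only, not part of the original class) could then apply the trained phraser to new data:

 # Illustrative companion method, assuming the same self.phraser attribute:
 def transform(self, data_iterator):
     # apply the learned collocations to each token sequence
     return (self.phraser[tokens] for tokens in data_iterator)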
Example #25
        return porterStemmer.stem(text)
    return porterStemmer.stem_sentence(text)


def getTokenizedSentences(text):
    sentences = splitToSentences(text)
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentence = [
            token for token in tokenize(sentence, lower=True)
        ]
        tokenized_sentences.append(tokenized_sentence)
    return tokenized_sentences


bigram_phrases = phrases.Phrases(min_count=1, threshold=1)


def getbigramTokenizedSentences(text):
    """ note that this is an on going implemetation, thus order of articles might change the results"""
    tokenized_sentences = getTokenizedSentences(text)
    bigram_phrases.add_vocab(tokenized_sentences)
    bigram_sentences = [
        bigram_phrases[sentence] for sentence in tokenized_sentences
    ]
    return bigram_sentences


def getWordAndBigrams2Freq(text):
    tokenized_sentences = getTokenizedSentences(text)
    bigram_phrases.add_vocab(tokenized_sentences)
Example #26
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
from gensim.models import phrases

df = pd.read_csv("dataset", header=1, names=["A", "B", "C", "D"])

##########################use same preprocessing as specific dataset#######################################################################################################

bigrams = phrases.Phrases(sentences)

epoch_list = [50]
size_list = [10]

for x in epoch_list:
    for y in size_list:

        vec_size = y
        max_epochs = x
        mode = 0
        model = Word2Vec(size=vec_size, sg=mode, iter=max_epochs)
        model.build_vocab(bigrams[sentences])
        model.train(bigrams[sentences],
                    total_examples=model.corpus_count,
                    epochs=model.epochs)
        model.save("10vector50epoch_new_tweet_word2vec_model")
Example #27
print(data.head())
for text in tqdm(data['body']):
    if type(text) is str:
        text = text.lower()  #lowercase
        tokens = [
            tokenize.word_tokenize(t) for t in tokenize.sent_tokenize(text)
        ]
        sentences_2010.extend(tokens)

#train models
print('training (unigram) model')
model1a = Word2Vec(sentences_2010)  #default model, min_counts = 5
model1a.save("models/ED_4cat_snapshot1_lowercase.model")

print('training bigram model')
bigrams = phrases.Phrases(sentences_2010)
model1b = Word2Vec(bigrams[sentences_2010])  #default model, min_counts = 5
model1b.save("models/ED_4cat_snapshot1_bigrams_lowercase.model")

#%%
#GENERATE MODELS FOR SECOND ED SNAPSHOT LOWERCASE (2020)
sentences_2020_lower = []

data = pd.read_csv('ED_data/ED_data_2020.csv')
print(data.head())
for text in tqdm(data['body']):
    text = text.lower()  #make lowercase
    tokens = [tokenize.word_tokenize(t) for t in tokenize.sent_tokenize(text)]
    sentences_2020_lower.extend(tokens)

print('training first 2020 model')