def get_skipgram(tweets, nIn, kIn):
    # Tokenization and preprocessing (if not yet done) must happen here: when the
    # analyzer receives a callable, TfidfVectorizer will not tokenize, see the
    # scikit-learn documentation.
    tweet_tokenized = []
    for t in tweets:
        tweet_tokenized.append(nlp.tokenize(t))
    skipper = functools.partial(skipgrams, n=nIn, k=kIn)
    vectorizer = TfidfVectorizer(
        analyzer=skipper,
        # stop_words=nlp.stopwords,  # We do better when we keep stopwords
        use_idf=True,
        smooth_idf=False,
        norm=None,  # no document-length normalisation
        decode_error='replace',
        max_features=10000,
        min_df=2,
        max_df=0.501)

    # for t in cleaned_tweets:
    #     tweetTokens = word_tokenize(t)
    #     skipgram_feature_matrix.append(list(skipper(tweetTokens)))

    # Fit the tokenized tweets into the vectorizer.
    logger.info("\tgenerating skip-gram vectors, n={}, k={}, {}".format(
        nIn, kIn, datetime.datetime.now()))
    tfidf = vectorizer.fit_transform(tweet_tokenized).toarray()
    logger.info("\t\t complete, dim={}, {}".format(tfidf.shape, datetime.datetime.now()))
    vocab = {v: i for i, v in enumerate(vectorizer.get_feature_names())}
    return tfidf, vocab
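# A minimal usage sketch (not part of the original pipeline) of the callable-analyzer
# trick used by get_skipgram above: passing functools.partial(skipgrams, n=..., k=...)
# as the analyzer makes TfidfVectorizer treat each pre-tokenized document as a bag of
# skip-gram features. The toy corpus is illustrative only; it assumes nltk and
# scikit-learn are installed.
def _skipgram_vectorizer_sketch():
    import functools
    from nltk.util import skipgrams
    from sklearn.feature_extraction.text import TfidfVectorizer

    docs = [["offensive", "tweet", "example", "text"],
            ["another", "tweet", "example"]]
    skipper = functools.partial(skipgrams, n=2, k=1)  # bigrams with up to 1 skipped token
    vectorizer = TfidfVectorizer(analyzer=skipper, use_idf=True, norm=None)
    tfidf = vectorizer.fit_transform(docs)
    # each feature is a tuple of tokens, e.g. ('offensive', 'example')
    return tfidf.shape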
def select_input_words(sent: str):
    orig_toks = nlp.tokenize(sent, 2)  # keep original form
    norm_toks = nlp.tokenize(sent, 1)  # lemmatize
    pos_tags = nlp.get_pos_tags(orig_toks)
    selected = set()
    for i in range(0, len(pos_tags)):
        word = orig_toks[i].lower()
        if word in nlp.stopwords or len(word) < 2:
            continue
        norm = norm_toks[i]
        tag = pos_tags[i]
        if tag in ["NN", "NNS", "NNP", "NNPS"]:
            selected.add(norm)
    return selected
def text_to_vector_gensim(text, model, text_length, dim, text_norm_option,
                          word_weigts: list = None):
    """
    Given a string, normalizes it, then splits it into words and finally converts
    it to a sequence of word vectors (one row per word, zero-padded to text_length rows).
    """
    text = nlp.normalize(text)
    words = nlp.tokenize(text, text_norm_option)
    window = words[-text_length:]

    x = np.zeros((text_length, dim))
    # track words already found in the embedding model (or already used as random
    # substitutes), so their vectors are not reused for other out-of-vocabulary words
    words_matched = set()
    for i, word in enumerate(window):
        weight = get_word_weight(word_weigts, word)
        if word in model.wv.vocab.keys():
            vec = model.wv[word]
            words_matched.add(word)
        elif word in GLOBAL_embedding_randomized_vectors.keys():
            # out-of-vocabulary word that was already assigned a random vector
            vec = GLOBAL_embedding_randomized_vectors[word]
        else:
            # assign the vector of a randomly chosen, not yet used vocabulary word
            if len(GLOBAL_embedding_vocab_indexes) == 0:
                for n in range(0, len(model.wv.vocab.keys())):
                    GLOBAL_embedding_vocab_indexes.append(n)
                random.Random(4).shuffle(GLOBAL_embedding_vocab_indexes)
            while True:
                index = GLOBAL_embedding_vocab_indexes.pop()
                substitute = model.wv.index2word[index]
                if substitute not in words_matched:
                    words_matched.add(substitute)
                    break
            vec = model.wv[substitute]
            # cache under the original word so the same OOV word always maps to the same vector
            GLOBAL_embedding_randomized_vectors[word] = vec
        x[i, :] = vec * weight
    return x
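# A small sketch (not project code) of the fixed-length window logic above: the last
# `text_length` tokens each contribute one row of a zero-initialised (text_length x dim)
# matrix. A plain dict stands in for the gensim model, and the 3-dimensional vectors
# are made up for illustration; OOV tokens simply keep their zero row here instead of
# receiving a random substitute vector.
def _text_window_sketch():
    import numpy as np
    toy_vectors = {"hate": np.array([0.1, 0.2, 0.3]),
                   "speech": np.array([0.4, 0.5, 0.6])}
    tokens = ["this", "is", "hate", "speech"]
    text_length, dim = 3, 3
    window = tokens[-text_length:]          # ['is', 'hate', 'speech']
    x = np.zeros((text_length, dim))
    for i, tok in enumerate(window):
        if tok in toy_vectors:
            x[i, :] = toy_vectors[tok]
    return x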
def extract_words(line: str):
    line = str(line).replace("LETTERNUMBER", "")
    line = str(line).replace("NUMBER", "")
    norm_toks = nlp.tokenize(str(line), 1)
    words = []
    for nt in norm_toks:
        word = nt.lower()
        if word in nlp.stopwords or len(word) < 3:
            continue
        words.append(word)
    return words
def find_word_matches(dictionary, target_text, text_normalization_option):
    target_text = nlp.normalize_tweet(target_text)
    norm_toks = set(nlp.tokenize(target_text, text_normalization_option))
    scoresum = 0
    matchsum = 0
    matchmax = 0
    matchbool = 0
    for w, score in dictionary.items():
        score = float(score)
        if w in norm_toks:
            matchbool = 1
            matchsum += 1
            scoresum += score
            if matchmax < score:
                matchmax = score
    return scoresum, matchsum, matchmax, matchbool
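# A tiny worked example (not project code) of the dictionary-matching features above:
# for a toy score dictionary and an already-tokenised tweet, the four features are the
# summed score of matched entries, the match count, the highest matched score, and a
# binary "any match" flag.
def _word_match_sketch():
    dictionary = {"abuse": 0.9, "insult": 0.4}
    norm_toks = {"an", "insult", "indeed"}
    scoresum, matchsum, matchmax, matchbool = 0, 0, 0, 0
    for w, score in dictionary.items():
        if w in norm_toks:
            matchbool = 1
            matchsum += 1
            scoresum += score
            matchmax = max(matchmax, score)
    return scoresum, matchsum, matchmax, matchbool  # (0.4, 1, 0.4, 1)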
def text_to_vector_fasttext(text, ft_model, text_length, dim, text_norm_option,
                            word_weigts: list = None):
    """
    Given a string, normalizes it, then splits it into words and finally converts
    it to a sequence of word vectors.
    """
    text = nlp.normalize(text)
    words = nlp.tokenize(text, text_norm_option)
    window = words[-text_length:]

    x = np.zeros((text_length, dim))
    for i, word in enumerate(window):
        vec = ft_model.get_word_vector(word).astype('float32')
        weight = get_word_weight(word_weigts, word)
        x[i, :] = vec * weight
    return x
def fit_fasttext_holdout(df: DataFrame, split_at_row: int, class_col: int, outfolder: str,
                         task: str, text_norm_option: int, text_input_info: dict,
                         embedding_file: str):
    encoder = LabelBinarizer()
    y = df[:, class_col]
    print("\ttotal y rows=" + str(len(y)) + " with unique values=" + str(len(set(y))))
    print("\tencoding y labels..." + str(datetime.datetime.now()))
    if len(set(y)) > 2:
        y_int = encoder.fit_transform(y)
    else:
        # binary task: labels are hard-coded as 'CG' vs. everything else
        y_int = np.array([[1, 0] if l.strip() == 'CG' else [0, 1] for l in y])
    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index
        # print(l + "," + str(index))

    # build the text input for each row by concatenating the configured text columns
    X = []
    text_length = 0
    index = 0
    for row in df:
        text = ""
        for b in range(len(text_input_info)):
            info = text_input_info[b]
            t = concate_text(row, info["text_col"])
            t = nlp.normalize(t)
            text_length += int(info["text_length"])
            text += t + " "
        words = nlp.tokenize(text, text_norm_option)
        text = " ".join(words).strip()
        X.append([text])
        index += 1
    X = numpy.asarray(X, dtype=str)

    # hold-out split: rows before split_at_row are used for training, the rest for testing
    X_train_ = X[0:split_at_row]
    y_train_ = y[0:split_at_row]
    X_test_ = X[split_at_row:]
    y_test_ = y[split_at_row:]

    # prepare fasttext training data: "__label__<class>\t<text>", spaces in labels mapped to "|"
    fasttext_train = outfolder + "/fasttext_train.tsv"
    with open(fasttext_train, mode='w') as outfile:
        csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for i in range(len(X_train_)):
            label = y_train_[i]
            text = X_train_[i][0]
            csvwriter.writerow(["__label__" + label.replace(" ", "|"), text])

    # fasttext_test = outfolder + "/fasttext_test.tsv"
    # with open(fasttext_test, mode='w') as outfile:
    #     csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    #     for i in range(len(X_test_)):
    #         label = y_test_[i]
    #         text = X_test_[i][0]
    #         csvwriter.writerow(["__label__" + label, text])

    # equivalent CLI: -dim 300 -minn 4 -maxn 10 -wordNgrams 3 -neg 10 -loss ns -epoch 3000 -thread 30
    if embedding_file is not None and embedding_file.lower() != 'none':
        model = fasttext.train_supervised(input=fasttext_train, minn=4, maxn=10, wordNgrams=3,
                                          neg=10, loss='ns', epoch=3000, thread=30,
                                          dim=dmc.DNN_EMBEDDING_DIM,
                                          pretrainedVectors=embedding_file)
    else:
        model = fasttext.train_supervised(input=fasttext_train, minn=4, maxn=10, wordNgrams=3,
                                          neg=10, loss='ns', epoch=3000, thread=30,
                                          dim=dmc.DNN_EMBEDDING_DIM)

    # evaluate the model on the held-out rows
    X_test_as_list = []
    for row in X_test_:
        X_test_as_list.append(row[0])
    predictions = model.predict(X_test_as_list)[0]
    predicted_labels = []
    for i in predictions:
        label = i[0]
        l = label[len("__label__"):]  # strip the fasttext label prefix
        l = l.replace("|", " ")
        predicted_labels.append(y_label_lookup_inverse[l])
    util.save_scores(predicted_labels, y_int[split_at_row:, :].argmax(1), "dnn", task,
                     "_fasttext_", 3, outfolder)
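# A minimal hold-out sketch (illustrative only, not project code) of the train/predict
# round trip that fit_fasttext_holdout wraps. It assumes the `fasttext` pip package is
# installed and uses toy data and toy hyper-parameters, not the ones above; the tmpdir
# path and the demo labels are made up for illustration.
def _fasttext_holdout_sketch(tmpdir="/tmp"):
    import csv
    import fasttext

    train_path = tmpdir + "/ft_demo_train.tsv"
    rows = [("__label__CG", "buy cheap meds now"),
            ("__label__OK", "lovely weather today")] * 20
    with open(train_path, mode='w') as f:
        writer = csv.writer(f, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        writer.writerows(rows)

    model = fasttext.train_supervised(input=train_path, epoch=5, dim=25)
    labels, probs = model.predict(["cheap meds", "weather today"])
    return labels  # e.g. [('__label__CG',), ('__label__OK',)]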
def fit_fasttext(df: DataFrame, nfold: int, class_col: int, outfolder: str, task: str,
                 text_norm_option: int, text_input_info: dict, embedding_file: str):
    print("\t running fasttext using embedding file=" + str(embedding_file))
    encoder = LabelBinarizer()
    y = df[:, class_col]
    y_int = encoder.fit_transform(y)
    y_label_lookup = dict()
    y_label_lookup_inverse = dict()
    for index, l in zip(y_int.argmax(1), y):
        y_label_lookup[index] = l
        y_label_lookup_inverse[l] = index
        # print(l + "," + str(index))

    # build the text input for each row by concatenating the configured text columns
    X = []
    text_length = 0
    index = 0
    for row in df:
        text = ""
        for b in range(len(text_input_info)):
            info = text_input_info[b]
            text += concate_text(row, info["text_col"]) + " "
            text_length += int(info["text_length"])
        text = nlp.normalize(text)
        words = nlp.tokenize(text, text_norm_option)
        text = " ".join(words).strip()
        X.append([text])
        index += 1
    X = numpy.asarray(X, dtype=str)

    # perform n-fold validation manually (fastText does not plug into scikit-learn's CV wrappers)
    kfold = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=cl.RANDOM_STATE)
    splits = list(enumerate(kfold.split(X, y_int.argmax(1))))

    nfold_predictions = dict()
    for k in range(0, len(splits)):
        print("\tnfold=" + str(k))

        # Fit the model on this fold
        X_train_index = splits[k][1][0]
        X_test_index = splits[k][1][1]
        X_train_ = X[X_train_index]
        y_train_ = y[X_train_index]
        X_test_ = X[X_test_index]
        y_test_ = y[X_test_index]

        # prepare fasttext training data: "__label__<class>\t<text>", spaces in labels mapped to "|"
        fasttext_train = outfolder + "/fasttext_train.tsv"
        with open(fasttext_train, mode='w') as outfile:
            csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            for i in range(len(X_train_)):
                label = y_train_[i]
                text = X_train_[i][0]
                csvwriter.writerow(["__label__" + label.replace(" ", "|"), text])

        # fasttext_test = outfolder + "/fasttext_test.tsv"
        # with open(fasttext_test, mode='w') as outfile:
        #     csvwriter = csv.writer(outfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        #     for i in range(len(X_test_)):
        #         label = y_test_[i]
        #         text = X_test_[i][0]
        #         csvwriter.writerow(["__label__" + label, text])

        # equivalent CLI: -dim 300 -minn 4 -maxn 10 -wordNgrams 3 -neg 10 -loss ns -epoch 3000 -thread 30
        if embedding_file is not None:
            model = fasttext.train_supervised(input=fasttext_train, minn=4, maxn=10, wordNgrams=3,
                                              neg=10, loss='ns', epoch=3000, thread=30,
                                              dim=dmc.DNN_EMBEDDING_DIM,
                                              pretrainedVectors=embedding_file)
        else:
            model = fasttext.train_supervised(input=fasttext_train, minn=4, maxn=10, wordNgrams=3,
                                              neg=10, loss='ns', epoch=3000, thread=30,
                                              dim=dmc.DNN_EMBEDDING_DIM)

        # evaluate the model on this fold's test rows
        X_test_as_list = []
        for row in X_test_:
            X_test_as_list.append(row[0])
        predictions = model.predict(X_test_as_list)[0]
        for i in range(len(X_test_index)):
            index = X_test_index[i]
            label = predictions[i][0]
            l = label[len("__label__"):]  # strip the fasttext label prefix
            l = l.replace("|", " ")
            nfold_predictions[index] = y_label_lookup_inverse[l]

    # collect the out-of-fold predictions in row order and score them
    indexes = sorted(list(nfold_predictions.keys()))
    predicted_labels = []
    for i in indexes:
        predicted_labels.append(nfold_predictions[i])
    util.save_scores(predicted_labels, y_int.argmax(1), "dnn", task, "_fasttext_", 3, outfolder)
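# A short sketch (not project code) of the fastText label convention used by both
# fit_fasttext functions above: class names are written to the training file as
# "__label__<name>" with spaces mapped to "|", and predictions are decoded by stripping
# the prefix and reversing the substitution.
def _fasttext_label_roundtrip_sketch(label="hate speech"):
    encoded = "__label__" + label.replace(" ", "|")        # '__label__hate|speech'
    decoded = encoded[len("__label__"):].replace("|", " ")
    assert decoded == label
    return encoded, decoded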
def extract_dict(label_to_proftext: dict):
    # frequency-based score
    label_vocab_to_totalfreq = dict()
    vocab_overall_frequency = dict()
    label_to_nouns = dict()
    label_to_verbs = dict()
    for label, texts in label_to_proftext.items():
        print(label + "," + str(len(texts)))
        vocab_score = dict()
        # identify verbs and nouns for this label
        nouns = set()
        verbs = set()
        for t in texts:
            orig_toks = nlp.tokenize(t, 2)
            stem_toks = nlp.tokenize(t, text_normalization_option)
            pos_tags = nlp.get_pos_tags(orig_toks)
            for i in range(0, len(pos_tags)):
                word = orig_toks[i].lower()
                if word in nlp.stopwords or len(word) < 2:
                    continue
                stem = stem_toks[i]
                if stem in vocab_score.keys():
                    vocab_score[stem] += 1
                else:
                    vocab_score[stem] = 1
                tag = pos_tags[i]
                if tag in ["NN", "NNS", "NNP", "NNPS"]:
                    nouns.add(stem_toks[i])
                elif tag in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
                    verbs.add(stem_toks[i])
        label_to_nouns[label] = nouns
        label_to_verbs[label] = verbs
        label_vocab_to_totalfreq[label] = vocab_score
        for e, frequency in vocab_score.items():
            if frequency == 0:
                continue
            if e in vocab_overall_frequency.keys():
                vocab_overall_frequency[e] += frequency
            else:
                vocab_overall_frequency[e] = frequency

    # calculate weighted score: a term's frequency within this label divided by its
    # frequency across all labels; terms occurring under a single label only (score 1.0) are skipped
    label_vocab_to_weightedscore = dict()
    for label, vocab_freq in label_vocab_to_totalfreq.items():
        vocab_score = dict()
        for e, frequency in vocab_freq.items():
            if e not in vocab_overall_frequency.keys():
                continue
            totalfreq = vocab_overall_frequency[e]
            s = frequency / totalfreq
            if s == 1.0:
                continue
            vocab_score[e] = s
        label_vocab_to_weightedscore[label] = vocab_score
    return label_vocab_to_totalfreq, label_vocab_to_weightedscore, label_to_nouns, label_to_verbs
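# A small worked example (not project code) of the weighting scheme above: a term's score
# for a label is its frequency under that label divided by its total frequency across all
# labels, and terms that occur under a single label only (score 1.0) are discarded. The
# toy frequencies below are made up for illustration.
def _label_term_weight_sketch():
    per_label_freq = {"abusive": {"idiot": 6, "day": 2},
                      "normal":  {"idiot": 2, "day": 8}}
    overall = {}
    for freqs in per_label_freq.values():
        for term, f in freqs.items():
            overall[term] = overall.get(term, 0) + f
    weighted = {label: {t: f / overall[t] for t, f in freqs.items() if f / overall[t] != 1.0}
                for label, freqs in per_label_freq.items()}
    # {'abusive': {'idiot': 0.75, 'day': 0.2}, 'normal': {'idiot': 0.25, 'day': 0.8}}
    return weighted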