def topic_finder(document):
    """Main method for finding topics in the text document.

    :param document: sample text
    :return: list of possible topics
    """
    topics_list = []
    try:
        important_nouns = document.find_topic()
        # Load the pre-trained trigram tagger (produced by train_tagger()).
        with open('trained_tagger.pkl', 'rb') as tagger_file:
            trigram_tagger = pickle.load(tagger_file)
        sentences = pre.tokenize_to_sentences(
            pre.remove_punctuation(document.sample))
        sentences = [pre.tokenize_to_words(sent) for sent in sentences]
        # Keep only sentences that mention the most important noun.
        sentences = [
            sentence for sentence in sentences
            if important_nouns[0].lower() in [word.lower() for word in sentence]
        ]
        tagged_sentences = [trigram_tagger.tag(sent) for sent in sentences]
        svo_data = [
            get_svo(sentence, important_nouns[0])
            for sentence in tagged_sentences
        ]
        # Join each subject-verb-object triple back into a readable phrase.
        for svo in svo_data:
            topics_list.append(' '.join(word[0] for word in svo))
    except IndexError:
        if not topics_list:
            topics_list.append("Topic not found. Need more data.")
    return topics_list

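
# topic_finder() above unpickles 'trained_tagger.pkl'; the inline train_tagger() hint
# suggests the tagger is trained elsewhere, but that code is not shown here. The following
# is a minimal, hypothetical sketch of how such a pickled tagger could be produced with
# NLTK's backoff n-gram taggers (the treebank corpus and the name train_tagger are
# assumptions, not the project's actual trainer).
import pickle

import nltk
from nltk.corpus import treebank  # requires nltk.download('treebank')


def train_tagger(path='trained_tagger.pkl'):
    # Chain default -> unigram -> bigram -> trigram taggers so unseen n-grams fall back.
    train_sents = treebank.tagged_sents()
    backoff = nltk.DefaultTagger('NN')
    backoff = nltk.UnigramTagger(train_sents, backoff=backoff)
    backoff = nltk.BigramTagger(train_sents, backoff=backoff)
    tagger = nltk.TrigramTagger(train_sents, backoff=backoff)
    with open(path, 'wb') as tagger_file:
        pickle.dump(tagger, tagger_file)
    return tagger
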
def preprocess_doc(doc):
    doc = preprocessing.tokenize(doc)
    doc = preprocessing.remove_punctuation(doc)
    doc = preprocessing.remove_numbers(doc)
    doc = preprocessing.lower(doc)
    doc = preprocessing.remove_common_stopwords(doc)
    doc = preprocessing.clean_doc(doc)
    return doc

def preprocess_corpus(documents):
    documents = list(map(preprocessing.tokenize, documents))
    documents = [preprocessing.remove_punctuation(doc) for doc in documents]
    documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    documents = [preprocessing.remove_common_stopwords(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]
    return documents

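
# preprocess_doc() and preprocess_corpus() above delegate to an external `preprocessing`
# module that is not included in this snippet. As a rough, self-contained illustration of
# what that chain presumably does to one document, here is a hypothetical stand-in (the
# stopword list and regexes are assumptions, not the real module):
import re

_STOPWORDS = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'is', 'are', 'for'}


def preprocess_doc_sketch(raw_text):
    tokens = raw_text.split()                                   # tokenize
    tokens = [re.sub(r'[^\w\s]', '', tok) for tok in tokens]    # remove punctuation
    tokens = [re.sub(r'\d+', '', tok) for tok in tokens]        # remove numbers
    tokens = [tok.lower() for tok in tokens]                    # lowercase
    tokens = [tok for tok in tokens if tok not in _STOPWORDS]   # remove common stopwords
    return [tok for tok in tokens if tok]                       # drop empty tokens
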
def full_preprocessing(self):
    """General preprocessing of the document sample.

    This method removes punctuation (. and , are kept), lowercases the text,
    removes English stop words, tokenizes the text into sentences and words,
    and builds a list of sentences tokenized into words.
    """
    self.text = pre.remove_punctuation(self.text)
    self.text = pre.to_lowercase(self.text)
    self.words = pre.tokenize_to_words(self.text)
    self.words = pre.remove_stopwords(self.words)
    self.text = ' '.join(self.words)
    self.sentences = pre.tokenize_to_sentences(self.text)
    self.normalized_sample = [pre.tokenize_to_words(sent)
                              for sent in self.sentences]
    return self.sentences

def clean_data():
    """Clean the tweets by removing punctuation and stop words.

    :return: DataFrame with the label and the cleaned token list
    """
    data = sc.textFile("data/data.txt")
    # Each line is "<text>\t<label>"; keep the text and the trailing label digit.
    col_rdd = data.map(lambda x: (x.split('\t')[0], x[-1]))
    punctuation_removed_rdd = col_rdd.map(
        lambda x: (remove_punctuation(x[0]), float(x[1])))
    data_df = sqlContext.createDataFrame(punctuation_removed_rdd,
                                         ["text", "label"])
    # StopWordsRemover expects an array of tokens, so split the cleaned text first.
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    tokenized_df = tokenizer.transform(data_df)
    remover = StopWordsRemover(inputCol="tokens",
                               outputCol="words",
                               stopWords=stopwords.words('english'))
    return remover.transform(tokenized_df).select(["label", "words"])

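
# clean_data() above relies on a remove_punctuation() helper that is not defined in this
# snippet. A plausible, minimal stand-in (an assumption, not the project's actual helper)
# simply strips ASCII punctuation from the tweet text:
import string


def remove_punctuation(text):
    # Translate every punctuation character to nothing, leaving words and spaces intact.
    return text.translate(str.maketrans('', '', string.punctuation))
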
def preprocess_doc(row, context=True):
    """Preprocess a single citation sentence according to the lda_params switches."""
    citation_sentence = str(row['context'])
    if lda_params['markers']:
        citation_sentence = preprocessing.remove_markers(citation_sentence)
    if lda_params['tokenize']:
        citation_sentence = preprocessing.tokenize(citation_sentence)
    if lda_params['pos_tags'] != ():
        tags = preprocessing.lower(
            preprocessing.filter_pos_tags(citation_sentence,
                                          tags=lda_params['pos_tags']))
    if lda_params['punctuation']:
        citation_sentence = preprocessing.remove_punctuation(citation_sentence)
    if lda_params['numbers']:
        citation_sentence = preprocessing.remove_numbers(citation_sentence)
    citation_sentence = preprocessing.lower(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.get_bigrams(citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.get_trigrams(citation_sentence)
    if lda_params['common_stopwords']:
        citation_sentence = preprocessing.remove_common_stopwords(citation_sentence)
    if lda_params['custom_stopwords']:
        citation_sentence = preprocessing.remove_custom_stopwords(citation_sentence)
    if lda_params['pos_tags'] != ():
        citation_sentence = preprocessing.filter_pos(citation_sentence, tags)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.filter_n_grams(bigrams, citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.filter_n_grams(trigrams, citation_sentence)
    if lda_params['bigrams'] and not lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams
    if lda_params['trigrams'] and not lda_params['bigrams']:
        citation_sentence = citation_sentence + trigrams
    if lda_params['bigrams'] and lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams + trigrams
    if lda_params['lemmatize']:
        citation_sentence = preprocessing.lemmatize(citation_sentence)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    return citation_sentence

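
# Both preprocess_doc() above and build_model() below read their switches from a
# module-level lda_params dict that is not shown here. The keys below are the ones the
# code actually consults; the values are only illustrative assumptions:
lda_params_example = {
    'markers': True,             # strip citation markers
    'tokenize': True,            # split the sentence into tokens
    'pos_tags': ('NN', 'NNS'),   # POS tags to keep; () disables POS filtering
    'punctuation': True,         # remove punctuation
    'numbers': True,             # remove numeric tokens
    'bigrams': False,            # append filtered bigrams
    'trigrams': False,           # append filtered trigrams
    'common_stopwords': True,    # remove standard stopwords
    'custom_stopwords': True,    # remove project-specific stopwords
    'lemmatize': True,           # lemmatize tokens
    'num_topics': 10,            # LDA topic count (used by build_model)
    'model_dir': 'models/lda/',  # output directory for the saved model files
}
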
def get_defaults(query=None,
                 number_top_matches: int = 2,
                 ds=data,
                 ds_c_status_embeddings: list = None,
                 ds_lm_embeddings: list = None,
                 ds_v_embeddings: list = None,
                 ds_dcurr_embeddings: list = None,
                 query_embeddings: dict = None,
                 model=model,
                 preprocess_ds: bool = False):
    # Avoid mutable default arguments: build a fresh query dict per call.
    if query is None:
        query = {
            "controller_status": "",
            "lumen": "",
            "voltage": "",
            "driver_current": ""
        }

    # Normalise the non-empty query fields.
    for key in query:
        if len(query[key]):
            query[key] = pr.to_lower(query[key])
            query[key] = pr.remove_punctuation(query[key])

    if preprocess_ds:
        ds = ds.fillna("")
        for column in ["controller_status", "lumen", "voltage", "driver_current"]:
            ds[column] = ds[column].apply(pr.to_lower)
            ds[column] = ds[column].apply(pr.remove_punctuation)

    # Encode the dataset columns for which no pre-computed embeddings were passed in.
    if not ds_c_status_embeddings:
        ds_c_status_embeddings = [model.encode(c) for c in ds["controller_status"].tolist()]
    if not ds_lm_embeddings:
        ds_lm_embeddings = [model.encode(lm) for lm in ds["lumen"].tolist()]
    if not ds_v_embeddings:
        ds_v_embeddings = [model.encode(v) for v in ds["voltage"].tolist()]
    if not ds_dcurr_embeddings:
        ds_dcurr_embeddings = [model.encode(curr) for curr in ds["driver_current"].tolist()]

    # Score each populated query field against the matching dataset column.
    scores = []
    query_embeddings = {}
    if len(query["controller_status"]):
        query_embeddings["controller_status"] = model.encode(query["controller_status"])
        scores.append(compute_similarity(query_embeddings["controller_status"],
                                         ds_c_status_embeddings))
    if len(query["lumen"]):
        query_embeddings["lumen"] = model.encode(query["lumen"])
        scores.append(compute_similarity(query_embeddings["lumen"], ds_lm_embeddings))
    if len(query["voltage"]):
        query_embeddings["voltage"] = model.encode(query["voltage"])
        scores.append(compute_similarity(query_embeddings["voltage"], ds_v_embeddings))
    if len(query["driver_current"]):
        query_embeddings["driver_current"] = model.encode(query["driver_current"])
        scores.append(compute_similarity(query_embeddings["driver_current"],
                                         ds_dcurr_embeddings))

    # Average the per-field similarity scores and pick the best-matching rows.
    mean_score = np.mean(scores, axis=0)
    top_indices = mean_score.argsort()[-number_top_matches:][::-1]
    similar_cases = ds.iloc[top_indices]
    result_df = pd.DataFrame()
    result_df["reason"] = similar_cases["reason"]
    return result_df

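
# get_defaults() above averages scores returned by a compute_similarity() helper that is
# not shown. Assuming it returns the cosine similarity between one query embedding and
# every dataset embedding, a minimal numpy sketch could look like this (name and behaviour
# are assumptions):
import numpy as np


def compute_similarity(query_embedding, dataset_embeddings):
    query = np.asarray(query_embedding, dtype=float)
    matrix = np.asarray(dataset_embeddings, dtype=float)
    # Cosine similarity: dot product divided by the product of the vector norms.
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    return (matrix @ query / np.maximum(norms, 1e-12)).tolist()
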
def build_model(documents):
    if lda_params['markers']:
        documents = map(preprocessing.remove_markers, documents)
    if lda_params['tokenize']:
        documents = map(preprocessing.tokenize, documents)
    documents = list(documents)
    if lda_params['pos_tags'] != ():
        tags = [
            preprocessing.lower(
                preprocessing.filter_pos_tags(doc, tags=lda_params['pos_tags']))
            for doc in documents
        ]
    if lda_params['punctuation']:
        documents = [preprocessing.remove_punctuation(doc) for doc in documents]
    if lda_params['numbers']:
        documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [preprocessing.get_bigrams(doc) for doc in documents]
    if lda_params['trigrams']:
        trigrams = [preprocessing.get_trigrams(doc) for doc in documents]
    if lda_params['common_stopwords']:
        documents = [preprocessing.remove_common_stopwords(doc) for doc in documents]
    if lda_params['custom_stopwords']:
        documents = [preprocessing.remove_custom_stopwords(doc) for doc in documents]
    if lda_params['pos_tags'] != ():
        documents = [
            preprocessing.filter_pos(documents[i], tags[i])
            for i in range(len(documents))
        ]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [
            preprocessing.filter_n_grams(bigrams[i], documents[i])
            for i in range(len(documents))
        ]
    if lda_params['trigrams']:
        trigrams = [
            preprocessing.filter_n_grams(trigrams[i], documents[i])
            for i in range(len(documents))
        ]
    if lda_params['bigrams'] and not lda_params['trigrams']:
        documents = [documents[i] + bigrams[i] for i in range(len(documents))]
    if lda_params['trigrams'] and not lda_params['bigrams']:
        documents = [documents[i] + trigrams[i] for i in range(len(documents))]
    if lda_params['bigrams'] and lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] + trigrams[i]
            for i in range(len(documents))
        ]
    if lda_params['lemmatize']:
        documents = [preprocessing.lemmatize(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]

    dictionary = generate_dictionary(documents)
    corpus = generate_corpus(documents, dictionary)
    lda_model = generate_lda_model(corpus, dictionary, lda_params['num_topics'])

    if not os.path.exists(lda_params['model_dir']):
        os.makedirs(lda_params['model_dir'])
    dictionary.save(lda_params['model_dir'] + 'lda.dict')
    gensim.corpora.MmCorpus.serialize(lda_params['model_dir'] + 'lda.mm', corpus)
    lda_model.save(lda_params['model_dir'] + 'lda.model')
    with open(lda_params['model_dir'] + 'lda.docs', 'wb') as docs_file:
        pickle.dump(documents, docs_file, pickle.HIGHEST_PROTOCOL)
    with open(lda_params['model_dir'] + 'lda_params.config', 'w') as config_file:
        config_file.write(str(lda_params))

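
# build_model() above persists the dictionary, corpus and LDA model with gensim. A short
# sketch of how those artifacts could be loaded back and inspected; the directory value is
# an assumed example of whatever lda_params['model_dir'] was when the model was built:
import gensim

model_dir = 'models/lda/'  # assumed example value of lda_params['model_dir']
dictionary = gensim.corpora.Dictionary.load(model_dir + 'lda.dict')
corpus = gensim.corpora.MmCorpus(model_dir + 'lda.mm')
lda_model = gensim.models.LdaModel.load(model_dir + 'lda.model')

# Print the top words of each learned topic.
for topic_id, topic in lda_model.print_topics(num_topics=-1, num_words=5):
    print(topic_id, topic)
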