def process_data(self, data_id: int, data: str,
                 add_document: bool) -> Optional[dict[str, float]]:
    """
    Preprocesses and processes a document.

    :param data_id: The document's ID.
    :param data: The content of the document.
    :param add_document: Whether the document should immediately be added
        to the TF.IDF collection.
    :return: The TF.IDF scores of each term in the document, unless the
        document was added to the collection. In that case nothing is
        returned.
    """
    preprocessed = remove_stopwords(lemmatize(split_text(data)))
    if not add_document:
        return self.tfidf.process_document(preprocessed)
    else:
        index = self.tfidf.add_document(preprocessed)
        self.data_ids[data_id] = index
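# A minimal usage sketch (not from the source): assumes this method lives on
# a wrapper class, hypothetically named Processor here, that owns the tfidf
# engine and the data_ids mapping.
processor = Processor()
scores = processor.process_data(1, "Cats chase mice.", add_document=False)  # per-term TF.IDF scores
processor.process_data(2, "Dogs chase cats.", add_document=True)  # added to collection; returns None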
def preprocess_doc(row, context=True):
    citation_sentence = str(row['context'])
    if lda_params['markers']:
        citation_sentence = preprocessing.remove_markers(citation_sentence)
    if lda_params['tokenize']:
        citation_sentence = preprocessing.tokenize(citation_sentence)
    # Compute POS tags before punctuation/stopword removal distorts them
    if lda_params['pos_tags'] != ():
        tags = preprocessing.lower(
            preprocessing.filter_pos_tags(citation_sentence,
                                          tags=lda_params['pos_tags']))
    if lda_params['punctuation']:
        citation_sentence = preprocessing.remove_punctuation(citation_sentence)
    if lda_params['numbers']:
        citation_sentence = preprocessing.remove_numbers(citation_sentence)
    citation_sentence = preprocessing.lower(citation_sentence)
    # Extract n-grams before stopword removal so phrases stay intact
    if lda_params['bigrams']:
        bigrams = preprocessing.get_bigrams(citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.get_trigrams(citation_sentence)
    if lda_params['common_stopwords']:
        citation_sentence = preprocessing.remove_common_stopwords(
            citation_sentence)
    if lda_params['custom_stopwords']:
        citation_sentence = preprocessing.remove_custom_stopwords(
            citation_sentence)
    if lda_params['pos_tags'] != ():
        citation_sentence = preprocessing.filter_pos(citation_sentence, tags)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    # Keep only n-grams whose member tokens survived the filtering above
    if lda_params['bigrams']:
        bigrams = preprocessing.filter_n_grams(bigrams, citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.filter_n_grams(trigrams, citation_sentence)
    # Append whichever n-gram lists are enabled to the unigram tokens
    if lda_params['bigrams'] and not lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams
    if lda_params['trigrams'] and not lda_params['bigrams']:
        citation_sentence = citation_sentence + trigrams
    if lda_params['bigrams'] and lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams + trigrams
    if lda_params['lemmatize']:
        citation_sentence = preprocessing.lemmatize(citation_sentence)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    return citation_sentence
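# For reference, an illustrative lda_params configuration covering every key
# that preprocess_doc and build_model read. Only the key names come from the
# code; the values here are assumptions.
lda_params = {
    'markers': True,             # strip citation markers
    'tokenize': True,
    'pos_tags': ('NN', 'NNS'),   # the empty tuple () disables POS filtering
    'punctuation': True,
    'numbers': True,
    'bigrams': True,
    'trigrams': False,
    'common_stopwords': True,
    'custom_stopwords': True,
    'lemmatize': True,
    'num_topics': 20,            # used by build_model
    'model_dir': 'models/lda/',  # used by build_model; note the trailing slash
}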
def build_model(documents):
    # Same preprocessing pipeline as preprocess_doc, applied corpus-wide
    if lda_params['markers']:
        documents = map(preprocessing.remove_markers, documents)
    if lda_params['tokenize']:
        documents = map(preprocessing.tokenize, documents)
    documents = list(documents)
    if lda_params['pos_tags'] != ():
        tags = [
            preprocessing.lower(
                preprocessing.filter_pos_tags(doc,
                                              tags=lda_params['pos_tags']))
            for doc in documents
        ]
    if lda_params['punctuation']:
        documents = [preprocessing.remove_punctuation(doc) for doc in documents]
    if lda_params['numbers']:
        documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [preprocessing.get_bigrams(doc) for doc in documents]
    if lda_params['trigrams']:
        trigrams = [preprocessing.get_trigrams(doc) for doc in documents]
    if lda_params['common_stopwords']:
        documents = [
            preprocessing.remove_common_stopwords(doc) for doc in documents
        ]
    if lda_params['custom_stopwords']:
        documents = [
            preprocessing.remove_custom_stopwords(doc) for doc in documents
        ]
    if lda_params['pos_tags'] != ():
        documents = [
            preprocessing.filter_pos(documents[i], tags[i])
            for i in range(len(documents))
        ]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [
            preprocessing.filter_n_grams(bigrams[i], documents[i])
            for i in range(len(documents))
        ]
    if lda_params['trigrams']:
        trigrams = [
            preprocessing.filter_n_grams(trigrams[i], documents[i])
            for i in range(len(documents))
        ]
    if lda_params['bigrams'] and not lda_params['trigrams']:
        documents = [documents[i] + bigrams[i] for i in range(len(documents))]
    if lda_params['trigrams'] and not lda_params['bigrams']:
        documents = [documents[i] + trigrams[i] for i in range(len(documents))]
    if lda_params['bigrams'] and lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] + trigrams[i]
            for i in range(len(documents))
        ]
    if lda_params['lemmatize']:
        documents = [preprocessing.lemmatize(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    # Drop documents left empty by the preprocessing
    documents = [doc for doc in documents if doc]

    # Train the LDA model and persist all artifacts for later reuse
    dictionary = generate_dictionary(documents)
    corpus = generate_corpus(documents, dictionary)
    lda_model = generate_lda_model(corpus, dictionary,
                                   lda_params['num_topics'])
    if not os.path.exists(lda_params['model_dir']):
        os.makedirs(lda_params['model_dir'])
    dictionary.save(lda_params['model_dir'] + 'lda.dict')
    gensim.corpora.MmCorpus.serialize(lda_params['model_dir'] + 'lda.mm',
                                      corpus)
    lda_model.save(lda_params['model_dir'] + 'lda.model')
    with open(lda_params['model_dir'] + 'lda.docs', 'wb') as docs_file:
        pickle.dump(documents, docs_file, pickle.HIGHEST_PROTOCOL)
    with open(lda_params['model_dir'] + 'lda_params.config', 'w') as config_file:
        config_file.write(str(lda_params))
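# Sketch of reloading the artifacts saved by build_model, using standard
# gensim load calls; this assumes generate_lda_model (not shown here)
# produced a gensim LdaModel.
import pickle

import gensim

model_dir = lda_params['model_dir']
dictionary = gensim.corpora.Dictionary.load(model_dir + 'lda.dict')
corpus = gensim.corpora.MmCorpus(model_dir + 'lda.mm')
lda_model = gensim.models.LdaModel.load(model_dir + 'lda.model')
with open(model_dir + 'lda.docs', 'rb') as docs_file:
    documents = pickle.load(docs_file)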
# Remove songs with missing genre
darklyrics = darklyrics[darklyrics.apply(
    lambda x: 'MISSING' not in x['genre'], axis=1)]
# generi = [lista for lista in darklyrics['genre'] if len(lista) > 1]

# Convert from multi-label to a single label, still to be evaluated
# darklyrics['genre'] = darklyrics.apply(lambda x: singularizegenre(x['genre']), axis=1)

# Magic
print("fix unicode")
darklyrics['lyrics'] = darklyrics.apply(
    lambda x: fix_wrong_unicode(x['lyrics']), axis=1)

# Token cleanup
print("tokenize")
darklyrics['tokens'] = darklyrics.apply(lambda x: tokenize(x['lyrics']),
                                        axis=1)

print("remove repetitions")
# Collapse tokens with repeated letters, e.g. aaaarggghhh -> argh
darklyrics['tokens'] = darklyrics.apply(
    lambda x: remove_repetitions(x['tokens']), axis=1)

print("lemmatize")
darklyrics['tokens'] = darklyrics.apply(lambda x: lemmatize(x['tokens']),
                                        axis=1)

darklyrics = darklyrics.drop('lyrics', axis=1)
darklyrics.to_csv('darklyrics-tokens.csv', index=False)
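# Reading the tokens back (a sketch, not from the source): to_csv serializes
# the token lists as their string repr, so they must be parsed on load. This
# assumes tokenize() returns plain Python lists.
import ast

import pandas as pd

darklyrics = pd.read_csv('darklyrics-tokens.csv')
darklyrics['tokens'] = darklyrics['tokens'].apply(ast.literal_eval)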
# Imports assumed by this function (module-level in the original source):
import string

import sklearn.feature_extraction.text
from lxml import etree
from sklearn.decomposition import LatentDirichletAllocation
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.sequence import pad_sequences

import data_util
import preprocessing


def extract(infile, outfile, dict_keys, stem=False, lemma=False,
            element="narrative", arg_rebalance=""):
    # labelname, featurenames, checklist, dem, rec_type, kw_features,
    # narr_features, symp_train, etc. are module-level configuration globals.
    train = False
    narratives = []
    keywords = []

    # Get the xml from file
    root = etree.parse(infile).getroot()

    if dict_keys == None:
        train = True
        # Set up the keys for the feature vector
        dict_keys = ["MG_ID", labelname]
        if checklist in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex",
                                     "CL_Occupation", "CL_Marital", "CL_Hypertension",
                                     "CL_Heart", "CL_Stroke", "CL_Diabetes", "CL_TB",
                                     "CL_HIV", "CL_Cancer", "CL_Asthma",
                                     "CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD",
                                     "CL_ApplytobaccoD"]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)

    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)

    # Extract features
    matrix = []
    for child in root:
        features = {}
        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
                item = child.find(key)
                value = "0"
                if item != None:
                    value = item.text
                if key == "AlcoholD" or key == "ApplytobaccoD":
                    if value == 'N':
                        value = 9
                features[key] = value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [s.strip().translate(string.maketrans("", ""), string.punctuation)
                     for s in keyword_string.split(',')]
            # Split keyword phrases into individual words.
            # Iterate over a copy: removing items from the list being
            # iterated would silently skip entries.
            for word in list(words):
                w = word.split(' ')
                words.remove(word)
                for wx in w:
                    words.append(wx.strip().strip('–'))
            keywords.append(" ".join(words))

        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item != None:
                if item.text != None:
                    narr_string = item.text.encode("utf-8")
                else:
                    print "warning: empty narrative"
            narr_words = [w.strip() for w in
                          narr_string.lower().translate(string.maketrans("", ""),
                                                        string.punctuation).split(' ')]
            text = " ".join(narr_words)
            if stem:
                narr_string = preprocessing.stem(text)
            elif lemma:
                narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item != None:
                item_text = item.text
                if item_text != None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix
    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames \
            or narr_tfidf in featurenames or kw_tfidf in featurenames \
            or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames \
                or lda in featurenames or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
                ngram_range=(min_ngram, max_ngram), stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x, i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))

            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)

            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)

            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x, i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                # n_topics was renamed n_components in scikit-learn >= 0.19
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0, num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val
            # TODO: Print LDA topics

    # WORD2VEC features
    elif narr_vec in featurenames:
        print "Warning: using word2vec features, ignoring all other features"

        # Create word2vec mapping
        word2vec, dim = load_word2vec(vecfile)

        # Convert words to vectors and add to matrix
        dict_keys.append(narr_vec)
        global max_seq_len
        max_seq_len = 200
        print "word2vec dim: " + str(dim)
        print "initial max_seq_len: " + str(max_seq_len)
        zero_vec = []
        for z in range(0, dim):
            zero_vec.append(0)
        for x in range(len(matrix)):
            narr = narratives[x]
            vectors = []
            vec = zero_vec
            for word in narr.split(' '):
                if len(word) > 0:
                    # Note: vec is not reset per word, so an out-of-vocabulary
                    # word repeats the most recent vector (zero_vec if none yet)
                    if word in word2vec:
                        vec = word2vec[word]
                    vectors.append(vec)
            length = len(vectors)
            if length > max_seq_len:
                vectors = vectors[(-1 * max_seq_len):]
            (matrix[x])[narr_vec] = vectors

        # Pad the narr_vecs with 0 vectors
        print "padding vectors to reach maxlen " + str(max_seq_len)
        for x in range(len(matrix)):
            length = len(matrix[x][narr_vec])
            matrix[x]['max_seq_len'] = max_seq_len
            if length < max_seq_len:
                for k in range(0, max_seq_len - length):
                    matrix[x][narr_vec].insert(0, zero_vec)  # use insert for pre-padding

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0

        sequences = []
        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr, vocab_size, hash_function='md5',
                                filters='\t\n', lower=True, split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)

        # Pad the sequences
        sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32',
                                  padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
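# Hypothetical invocation (file names are placeholders; the module-level
# globals such as featurenames and labelname must already be configured).
# Passing dict_keys=None selects training mode, which builds the key list
# and fits the global count_vectorizer / tfidfTransformer for later calls.
extract("train_records.xml", "train.features", None, stem=True)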
def convert_sents(doc):
    # Lemmatize, flatten the per-sentence token lists, and drop
    # single-character tokens
    s = flatten(lemmatize(doc))
    return [x for x in s if len(x) > 1]
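# Hypothetical example: assumes lemmatize returns nested token lists (one per
# sentence) that flatten concatenates into a single list.
tokens = convert_sents(["The cats were chasing mice.", "Dogs barked loudly."])
# Single-character leftovers (stray punctuation, "a", "i") are dropped.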