Example #1
File: main.py Project: JulesDesmet/lsi
    def process_data(
        self, data_id: int, data: str, add_document: bool
    ) -> Optional[dict[str, float]]:
        """
        Preprocesses and processes a document.

        :param data_id: The document's ID.
        :param data: The content of the document.
        :param add_document: Whether the document should immediately be added to the
            TF.IDF collection.
        :return: The TF.IDF scores of each term in the document, unless the document was
            added to the collection. In that case nothing is returned.
        """
        preprocessed = remove_stopwords(lemmatize(split_text(data)))
        if not add_document:
            return self.tfidf.process_document(preprocessed)
        else:
            index = self.tfidf.add_document(preprocessed)
            self.data_ids[data_id] = index
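
The preprocessing chain in process_data is three composed steps: split, lemmatize, remove stopwords. A minimal self-contained sketch of that chain using hypothetical stand-ins for the project's split_text, lemmatize, and remove_stopwords (the real implementations will differ):

# Hypothetical stand-ins, only to make the three-step chain concrete.
STOPWORDS = {"the", "a", "an", "of", "and", "on"}

def split_text(text: str) -> list[str]:
    # Naive whitespace tokenizer; the project's version is likely more involved.
    return text.lower().split()

def lemmatize(tokens: list[str]) -> list[str]:
    # Toy "lemmatizer" that only strips a plural 's'; purely illustrative.
    return [t[:-1] if t.endswith("s") and len(t) > 3 else t for t in tokens]

def remove_stopwords(tokens: list[str]) -> list[str]:
    return [t for t in tokens if t not in STOPWORDS]

preprocessed = remove_stopwords(lemmatize(split_text("The cats sat on the mats")))
print(preprocessed)  # ['cat', 'sat', 'mat']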
Example #2
def preprocess_doc(row, context=True):
    """Apply the preprocessing steps enabled in lda_params to a single citation context."""
    citation_sentence = str(row['context'])
    if lda_params['markers']:
        citation_sentence = preprocessing.remove_markers(citation_sentence)
    if lda_params['tokenize']:
        citation_sentence = preprocessing.tokenize(citation_sentence)
    if lda_params['pos_tags'] != ():
        tags = preprocessing.lower(
            preprocessing.filter_pos_tags(citation_sentence,
                                          tags=lda_params['pos_tags']))
    if lda_params['punctuation']:
        citation_sentence = preprocessing.remove_punctuation(citation_sentence)
    if lda_params['numbers']:
        citation_sentence = preprocessing.remove_numbers(citation_sentence)
    citation_sentence = preprocessing.lower(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.get_bigrams(citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.get_trigrams(citation_sentence)
    if lda_params['common_stopwords']:
        citation_sentence = preprocessing.remove_common_stopwords(
            citation_sentence)
    if lda_params['custom_stopwords']:
        citation_sentence = preprocessing.remove_custom_stopwords(
            citation_sentence)
    if lda_params['pos_tags'] != ():
        citation_sentence = preprocessing.filter_pos(citation_sentence, tags)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    if lda_params['bigrams']:
        bigrams = preprocessing.filter_n_grams(bigrams, citation_sentence)
    if lda_params['trigrams']:
        trigrams = preprocessing.filter_n_grams(trigrams, citation_sentence)
    if lda_params['bigrams'] and not lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams
    if lda_params['trigrams'] and not lda_params['bigrams']:
        citation_sentence = citation_sentence + trigrams
    if lda_params['bigrams'] and lda_params['trigrams']:
        citation_sentence = citation_sentence + bigrams + trigrams
    if lda_params['lemmatize']:
        citation_sentence = preprocessing.lemmatize(citation_sentence)
    citation_sentence = preprocessing.clean_doc(citation_sentence)
    return citation_sentence
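
Both preprocess_doc above and build_model below read their switches from a module-level lda_params dict. A hypothetical configuration covering the keys these functions consult; the values are illustrative assumptions, not the project's actual settings:

# Hypothetical lda_params: the keys mirror the lookups above, the values are made up.
lda_params = {
    'markers': True,             # strip citation markers
    'tokenize': True,            # split the sentence into tokens
    'pos_tags': ('NN', 'VB'),    # () disables POS filtering
    'punctuation': True,
    'numbers': True,
    'bigrams': True,
    'trigrams': False,
    'common_stopwords': True,
    'custom_stopwords': True,
    'lemmatize': True,
    'num_topics': 20,            # used by build_model
    'model_dir': 'models/lda/',  # trailing slash assumed: paths are built by concatenation
}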
Example #3
def build_model(documents):
    """Preprocess documents according to lda_params, train an LDA model, and save the resulting artifacts to lda_params['model_dir']."""
    if lda_params['markers']:
        documents = map(preprocessing.remove_markers, documents)
    if lda_params['tokenize']:
        documents = map(preprocessing.tokenize, documents)
    documents = list(documents)
    if lda_params['pos_tags'] != ():
        tags = [
            preprocessing.lower(
                preprocessing.filter_pos_tags(doc,
                                              tags=lda_params['pos_tags']))
            for doc in documents
        ]
    if lda_params['punctuation']:
        documents = [
            preprocessing.remove_punctuation(doc) for doc in documents
        ]
    if lda_params['numbers']:
        documents = [preprocessing.remove_numbers(doc) for doc in documents]
    documents = [preprocessing.lower(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [preprocessing.get_bigrams(doc) for doc in documents]
    if lda_params['trigrams']:
        trigrams = [preprocessing.get_trigrams(doc) for doc in documents]
    if lda_params['common_stopwords']:
        documents = [
            preprocessing.remove_common_stopwords(doc) for doc in documents
        ]
    if lda_params['custom_stopwords']:
        documents = [
            preprocessing.remove_custom_stopwords(doc) for doc in documents
        ]
    if lda_params['pos_tags'] != ():
        documents = [
            preprocessing.filter_pos(documents[i], tags[i])
            for i in range(0, len(documents))
        ]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    if lda_params['bigrams']:
        bigrams = [
            preprocessing.filter_n_grams(bigrams[i], documents[i])
            for i in range(0, len(documents))
        ]
    if lda_params['trigrams']:
        trigrams = [
            preprocessing.filter_n_grams(trigrams[i], documents[i])
            for i in range(0, len(documents))
        ]
    if lda_params['bigrams'] and not lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] for i in range(0, len(documents))
        ]
    if lda_params['trigrams'] and not lda_params['bigrams']:
        documents = [
            documents[i] + trigrams[i] for i in range(0, len(documents))
        ]
    if lda_params['bigrams'] and lda_params['trigrams']:
        documents = [
            documents[i] + bigrams[i] + trigrams[i]
            for i in range(0, len(documents))
        ]
    if lda_params['lemmatize']:
        documents = [preprocessing.lemmatize(doc) for doc in documents]
    documents = [preprocessing.clean_doc(doc) for doc in documents]
    documents = [doc for doc in documents if doc]

    dictionary = generate_dictionary(documents)
    corpus = generate_corpus(documents, dictionary)
    lda_model = generate_lda_model(corpus, dictionary,
                                   lda_params['num_topics'])

    if not os.path.exists(lda_params['model_dir']):
        os.makedirs(lda_params['model_dir'])
    dictionary.save(lda_params['model_dir'] + 'lda.dict')
    gensim.corpora.MmCorpus.serialize(lda_params['model_dir'] + 'lda.mm',
                                      corpus)
    lda_model.save(lda_params['model_dir'] + 'lda.model')
    with open(lda_params['model_dir'] + 'lda.docs', 'wb') as docs_file:
        pickle.dump(documents, docs_file, pickle.HIGHEST_PROTOCOL)
    with open(lda_params['model_dir'] + 'lda_params.config',
              'w') as config_file:
        config_file.write(str(lda_params))
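
build_model persists the dictionary, corpus, LDA model, preprocessed documents, and parameters. Assuming the model was built with gensim, the artifacts can be loaded back roughly like this (same model_dir as above):

import pickle
import gensim

model_dir = lda_params['model_dir']

dictionary = gensim.corpora.Dictionary.load(model_dir + 'lda.dict')
corpus = gensim.corpora.MmCorpus(model_dir + 'lda.mm')
lda_model = gensim.models.LdaModel.load(model_dir + 'lda.model')
with open(model_dir + 'lda.docs', 'rb') as docs_file:
    documents = pickle.load(docs_file)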
Example #4
    # Remove songs with a missing genre
    darklyrics = darklyrics[darklyrics.apply(
        lambda x: 'MISSING' not in x['genre'], axis=1)]
    # generi = [lista for lista in darklyrics['genre'] if len(lista)>1]

    # Convert from multi-label to a single label, still to be evaluated
    # darklyrics['genre'] = darklyrics.apply(lambda x: singularizegenre(x['genre']), axis=1)

    # Magic: fix broken unicode characters
    print("fix unicode")
    darklyrics['lyrics'] = darklyrics.apply(
        lambda x: fix_wrong_unicode(x['lyrics']), axis=1)

    # Token cleanup
    print("tokenize")
    darklyrics['tokens'] = darklyrics.apply(lambda x: tokenize(x['lyrics']),
                                            axis=1)

    print("remove repetitions")
    # Remove tokens with repeated letters, e.g. aaaarggghhh -> argh
    darklyrics['tokens'] = darklyrics.apply(
        lambda x: remove_repetitions(x['tokens']), axis=1)

    print("lemmatize")
    darklyrics['tokens'] = darklyrics.apply(lambda x: lemmatize(x['tokens']),
                                            axis=1)

    darklyrics = darklyrics.drop('lyrics', axis=1)

    darklyrics.to_csv('darklyrics-tokens.csv', index=False)
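
Each step above maps a single column through one function, so the row-wise apply(..., axis=1) calls can also be written column-wise with Series.apply, which avoids building a row object per record. A small self-contained sketch with stub helpers (the project's fix_wrong_unicode, tokenize, remove_repetitions, and lemmatize are not reproduced here):

import pandas as pd

# Stub helpers standing in for the project's preprocessing functions.
def fix_wrong_unicode(text): return text
def tokenize(text): return text.lower().split()
def remove_repetitions(tokens): return tokens
def lemmatize(tokens): return tokens

darklyrics = pd.DataFrame({
    'genre': [['black metal'], ['MISSING']],
    'lyrics': ['Frozen winds howl', 'this row is dropped'],
})

# Drop songs with a missing genre, then run the same pipeline column-wise.
darklyrics = darklyrics[darklyrics['genre'].apply(lambda g: 'MISSING' not in g)].copy()
darklyrics['lyrics'] = darklyrics['lyrics'].apply(fix_wrong_unicode)
darklyrics['tokens'] = darklyrics['lyrics'].apply(tokenize)
darklyrics['tokens'] = darklyrics['tokens'].apply(remove_repetitions)
darklyrics['tokens'] = darklyrics['tokens'].apply(lemmatize)
darklyrics = darklyrics.drop('lyrics', axis=1)
print(darklyrics)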
def extract(infile, outfile, dict_keys, stem=False, lemma=False, element="narrative", arg_rebalance=""):
    """Extract checklist, keyword, narrative, and text-derived features from the XML records in infile and write the feature matrix to outfile."""
    train = False
    narratives = []
    keywords = []
    
    # Get the xml from file
    root = etree.parse(infile).getroot()

    if dict_keys is None:
        train = True

        # Set up the keys for the feature vector
        dict_keys = ["MG_ID", labelname]
        if checklist in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation", "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke", "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma","CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)
        #keywords = set([])
        #narrwords = set([])

    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)
    # Extract features
    matrix = []
    for child in root:
        features = {}

        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
            item = child.find(key)
            value = "0"
            if item is not None:
                value = item.text
            if key == "AlcoholD" or key == "ApplytobaccoD":
                if value == 'N':
                    value = 9
            features[key] = value
            #print "-- value: " + value
            #if key == "MG_ID":
            #    print "extracting features from: " + value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [s.strip().translate(string.maketrans("",""), string.punctuation) for s in keyword_string.split(',')]
            # Split keyword phrases into individual words
            # (build a new list rather than mutating `words` while iterating over it)
            split_words = []
            for word in words:
                for wx in word.split(' '):
                    split_words.append(wx.strip().strip('–'))
            keywords.append(" ".join(split_words))
                
        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item is not None:
                if item.text is not None:
                    narr_string = item.text.encode("utf-8")
                else:
                    print "warning: empty narrative"
                narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
                text = " ".join(narr_words)

                if stem:
                    narr_string = preprocessing.stem(text)
                elif lemma:
                    narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())
            #print "Adding narr: " + narr_string.lower()

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item is not None:
                item_text = item.text
                if item_text is not None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
                    #narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix

    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(min_ngram,max_ngram),stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x,i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))

            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)

            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)

            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                #values = tfidf_matrix[x,0:]
                #print "values: " + str(values.shape[0])
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x,i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0,num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val

            # TODO: Print LDA topics

    # WORD2VEC features
    elif narr_vec in featurenames:
        print "Warning: using word2vec features, ignoring all other features"

        # Create word2vec mapping
        word2vec, dim = load_word2vec(vecfile)

        # Convert words to vectors and add to matrix
        dict_keys.append(narr_vec)
        global max_seq_len
        max_seq_len = 200
        #if train:
            #max_seq_len = 0
        print "word2vec dim: " + str(dim)
        print "initial max_seq_len: " + str(max_seq_len)
        zero_vec = []
        for z in range(0, dim):
            zero_vec.append(0)
        for x in range(len(matrix)):
            narr = narratives[x]
            #print "narr: " + narr
            vectors = []
            vec = zero_vec
            for word in narr.split(' '):
                if len(word) > 0:
                    #if word == "didnt":
                    #    word = "didn't"
                    if word in word2vec:
                        vec = word2vec[word]
                    vectors.append(vec)
            length = len(vectors)
            if length > max_seq_len:
                #if train:
                #    max_seq_len = length
                vectors = vectors[(-1*max_seq_len):]
            (matrix[x])[narr_vec] = vectors

        # Pad the narr_vecs with 0 vectors
        print "padding vectors to reach maxlen " + str(max_seq_len)
        for x in range(len(matrix)):
            length = len(matrix[x][narr_vec])
            matrix[x]['max_seq_len'] = max_seq_len
            if length < max_seq_len:
                for k in range(0, max_seq_len-length):
                    matrix[x][narr_vec].insert(0,zero_vec) # use insert for pre-padding

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0

        sequences = []

        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr, vocab_size, hash_function='md5', filters='\t\n', lower=True, split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)

        # Pad the sequences
        sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
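
The count/TF-IDF branch above follows the standard scikit-learn pattern: fit a CountVectorizer on the training documents, fit a TfidfTransformer on the resulting count matrix, and reuse both fitted objects at test time. A minimal Python 3 sketch of that flow, independent of the globals and feature-name flags used by extract:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

train_docs = ["fever and cough for three days", "chest pain and shortness of breath"]
test_docs = ["sudden chest pain"]

# Fit on the training narratives only; reuse the fitted objects for the test data.
count_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')
train_counts = count_vectorizer.fit_transform(train_docs)

tfidf = TfidfTransformer()
tfidf.fit(train_counts)

test_counts = count_vectorizer.transform(test_docs)
test_tfidf = tfidf.transform(test_counts)
print(test_tfidf.shape)  # (1, number of learned n-gram features)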
def convert_sents(doc):
    s = flatten(lemmatize(doc))
    return [x for x in s if len(x) > 1]