# `tp` (text preprocessing helpers), `config` (tokenizer, device, class names)
# and `md` (the trained sentiment model) are project modules assumed to be
# imported at module level.
import torch


def GetSentimentForText(_text):
    """Clean the given text, classify it with the trained model and return the class name."""
    review = str(_text)
    review = tp.expand_contractions(review)
    review = tp.scrub_words(review)
    review = tp.remove_accented_chars(review)
    encoding = config.TOKENIZER.encode_plus(
        review,
        max_length=config.MAX_LEN,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        return_token_type_ids=False,
        pad_to_max_length=True,  # deprecated in newer transformers; padding='max_length' is the modern equivalent
        return_attention_mask=True,
        return_tensors='pt',  # Return PyTorch tensors
    )
    input_ids = encoding['input_ids'].to(config.device)
    attention_mask = encoding['attention_mask'].to(config.device)
    output = md.model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)
    print(f'Review text: {review}')
    print(f'Sentiment : {config.class_names[prediction]}')
    return config.class_names[prediction]
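# Hedged usage sketch for GetSentimentForText, assuming `config`, `tp` and `md`
# are set up as above with the model and tokenizer already loaded; the sample
# text and the helper name `_demo_sentiment` are made up for illustration.
def _demo_sentiment():
    sample = "The movie wasn't great, but I didn't hate it either."
    label = GetSentimentForText(sample)
    # `label` is one of config.class_names, e.g. 'negative' / 'neutral' / 'positive'
    print(f"Predicted label: {label}")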
def getWordCounts(articles, useAbstract=True):
    """
    Get for each calendar week the word frequencies of all occurring words in
    the headlines (and optionally also abstracts) in the given dataset of
    ``articles``.

    Parameters
    ----------
    articles : dict
        Dict of news articles in JSON format.
    useAbstract : bool
        Specifies whether the abstract should also be used - only available
        for NYT (default is True).

    Returns
    -------
    dict
        Dict of dicts for each calendar week with word frequencies.
    """
    result = {}
    articles = get_articles_as_list(articles)
    for a in articles:
        key = (getYear(a['pub_date']), getCalendarWeek(a['pub_date']))
        if key not in result:
            result[key] = {}
        for w in txt.parseSentence(a['headline']):
            if w not in result[key]:
                result[key][w] = 1
            else:
                result[key][w] += 1
        if useAbstract and "abstract" in a:
            for w in txt.parseSentence(a['abstract']):
                if w not in result[key]:
                    result[key][w] = 1
                else:
                    result[key][w] += 1
    return result
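# Hedged usage sketch for getWordCounts; the file name and the printed keys are
# illustrative assumptions, not part of the original project, and the helper
# name `_demo_word_counts` is hypothetical.
def _demo_word_counts():
    import json
    with open("articles_2020.json", "r", encoding="utf-8") as f:  # hypothetical file
        articles = json.load(f)
    counts = getWordCounts(articles, useAbstract=True)
    # Keys are (year, calendar_week) tuples; values map word -> frequency,
    # e.g. counts[(2020, 12)] might look like {'virus': 37, 'election': 12, ...}
    for (year, week), freqs in sorted(counts.items())[:3]:
        top = sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)[:5]
        print(year, week, top)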
import re

from sklearn import preprocessing

# TextPreprocessing is a project-level helper (tokenizer etc.) assumed to be importable.


class ClassificationPreprocessing:

    def __init__(self):
        self.tp = TextPreprocessing()
        self.encoder = preprocessing.MultiLabelBinarizer()
        self.word_classes_indexes = {}

    def fit_from_text(self, ref_file):
        sentences = []
        cl = []
        origin = []
        with open(ref_file, 'r', encoding="utf-8") as f:
            next(f)  # skip the header line
            for line in f:
                line_ = line.split(';')
                if line_[1] == 'T':
                    c = line_[7]
                    c = re.sub(r'[\+\.\-†*!\s]', '', c)
                    cl.append(c)
        self.encoder.fit(cl)
        self.n_classes = len(self.encoder.classes_)
        self.class_to_index = dict(
            zip(self.encoder.classes_, range(self.n_classes)))

    def fit(self, y):
        self.encoder.fit(y)
        self.n_classes = len(self.encoder.classes_)
        self.class_to_index = dict(
            zip(self.encoder.classes_, range(self.n_classes)))

    def fit_word_indexes(self, index, c_index, class_to_exclude=[]):
        for c, s in zip(c_index, index):
            if c not in class_to_exclude:
                s = self.tp.tokenizer(s)
                for w in s:
                    if w not in self.word_classes_indexes:
                        self.word_classes_indexes[w] = set()
                    self.word_classes_indexes[w].update(
                        [self.class_to_index[c]])
        # np.save('word_codes_indexes', self.word_codes_indexes)
        print('Index word size =', len(self.class_to_index))
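# Minimal usage sketch for ClassificationPreprocessing. The labels and sentences
# are invented and `_demo_classification_preprocessing` is a hypothetical helper;
# fit_word_indexes relies on the project's TextPreprocessing.tokenizer.
def _demo_classification_preprocessing():
    cp = ClassificationPreprocessing()
    labels = [('A01',), ('B02',), ('A01', 'B02')]  # hypothetical multi-label targets
    cp.fit(labels)
    print(cp.n_classes, cp.class_to_index)
    sentences = ["fever and cough", "broken arm", "fever after fall"]
    classes = ['A01', 'B02', 'A01']
    cp.fit_word_indexes(sentences, classes)
    print(cp.word_classes_indexes.get('fever'))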
import TextPreprocessing
import pymongo
import os

if __name__ == "__main__":
    client = pymongo.MongoClient("mongodb://localhost:27017")
    db = client['epilepsy_qa_system_demo_data']
    # collection_names() is the legacy pymongo call; it was removed in pymongo 4
    # in favour of list_collection_names().
    collectionNames = db.collection_names()
    SEED = 1000

    # get epilepsy dataset from demo_data
    illnessSet_epilepsy = TextPreprocessing.processingData(
        isEpilepsy=True, collectionNames=collectionNames, db=db)
    # get no-epilepsy dataset from demo_data
    illnessSet_noEpilepsy = TextPreprocessing.processingData(
        isEpilepsy=False, collectionNames=collectionNames, db=db)
    # build negative data
    illnessSet_epilepsy_negative = TextPreprocessing.buildNegative(
        illnessSet_epilepsy,
        illnessSet_noEpilepsy,
        rate=1,
        seed=SEED,
    )
    dataset_trainAndDev, dataset_test = TextPreprocessing.deal_concat_random_cut(
        illnessSet_epilepsy, illnessSet_noEpilepsy)

    # save dataset
    dataPath = '../data'
import pickle

import streamlit as st
import pandas as pd
import numpy as np
import TextPreprocessing as tp

# Load the trained SVC classifier and the fitted TF-IDF vectorizer
with open('sentiment_svc.pickle', 'rb') as f:
    loaded_classifier = pickle.load(f)
with open('sentiment_tfidf.pickle', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

st.title("Twitter Sentiment Analysis App")
text = st.text_input("Tweet to analyze:")
st.write("Tweet:")
st.write(text)

# Preprocess text, vectorize it and predict the sentiment class
text_cleaned = tp.clean_tweet(text)
transformed_data = loaded_vectorizer.transform([text_cleaned])
prediction = loaded_classifier.predict(transformed_data)

if prediction[0] == 0:
    prediction_name = "Negative"
elif prediction[0] == 1:
    prediction_name = "Neutral"
else:
    prediction_name = "Positive"

st.write(f"Sentiment: {prediction_name}")
import re

# TextPreprocessing, build_vocabulary and vectorize_corpus are project helpers
# assumed to be available in this module's namespace.


class FeatureExtractor:

    def __init__(self):
        self.token_index = {}
        self.dictionary = {}
        self.text_preprocessing = TextPreprocessing()

    def fit(self, sentences, START_VOCAB=[]):
        tokenized_sent = [self.text_preprocessing.tokenizer(s) for s in sentences]
        self.dictionary, self.rev_dictionary = build_vocabulary(tokenized_sent, START_VOCAB)

    def barket_removal(self, s):
        # "barket" is kept as spelled in the original API; generates string
        # variants with the round-bracketed content inlined or dropped.
        all_ = re.findall(r'\((.*?)\)', s)
        if len(all_) != 0:
            all_ = [w for w in all_] + ['']
            _s_ = []
            s_ = re.split(r'\(.*?\)', s)
            if ' ' in s_:
                s_.remove(' ')
            for i, sp in enumerate(all_):
                s__ = ''
                for j, w in enumerate(s_):
                    if j == 0:
                        s__ = w + sp
                    else:
                        s__ = s__ + w
                s__ = re.sub(r'\s{2,}', ' ', s__)
                _s_.append(s__)
            return _s_
        else:
            return [s]

    def square_barket_removal(self, s):
        all_ = re.findall(r'\[(.*?)\]', s)
        if len(all_) != 0:
            _s_ = []
            all_ = [w for w in all_]
            for i, sp in enumerate(all_):
                s_ = re.split(r'\[.*?\]', s)
                if ' ' in s_:
                    s_.remove(' ')
                del s_[-0]  # note: -0 == 0, so this drops the first split segment
                s___ = sp + ' '.join(s_)
                s___ = re.sub(r'\s{2,}', ' ', s___)
                _s_.append(s___)
                s_ = re.split(r'\[.*?\]', s)
                if ' ' in s_ and len(s_) > 2:
                    s_.remove(' ')
                    s___ = ' '.join(s_)
                    s___ = re.sub(r'\s{2,}', ' ', s___)
                    _s_.append(s___)
                s___ = s_[0] + sp + s_[1]
                s___ = re.sub(r'\s{2,}', ' ', s___)
                _s_.append(s___)
            return _s_ + all_
        else:
            return [s]

    def features_from_tokens(self, tokenize_sent, max_length):
        return vectorize_corpus(tokenize_sent, max_length, self.dictionary)
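# Quick illustrative calls for the bracket-handling helpers ("barket" is kept as
# spelled in the original identifiers). The input strings and the helper name
# `_demo_bracket_variants` are invented; FeatureExtractor() needs the project's
# TextPreprocessing class to be importable.
def _demo_bracket_variants():
    fe = FeatureExtractor()
    # Each call returns a list of string variants with the bracketed part
    # inlined, dropped, or kept on its own.
    print(fe.barket_removal("chest pain (left side) after exercise"))
    print(fe.square_barket_removal("fracture [closed] of the wrist"))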
def restore_keyword(keyword, articles, start=None, end=None, searchrange=None,
                    minLength=2, minCount=5, useAbstract=True):
    """
    Get subsequences of words which contain a single ``keyword``.

    Searches all subsequences of words which contain the ``keyword`` in
    headlines and abstracts for the given dataset of ``articles``.

    Notes
    -----
    1.) In contrast to ``get_cooccurrences``, here we consider the positional
        distance of a word to the keyword in the headline/abstract.
    2.) Using ``searchrange`` improves the performance.

    Parameters
    ----------
    keyword : str
        Keyword in ``articles``.
    articles : dict
        Dict of news articles in JSON format.
    start : datetime.date
        Search is limited to articles which were published on or after this
        day (default is None).
    end : datetime.date
        Search is limited to articles which were published on or before this
        day (default is None).
    searchrange : int
        Only consider words within this maximum (symmetric) positional
        distance to the keyword in the headline/abstract (default is None).
    minLength : int
        Minimum length (number of words) of the subsequence (default is 2).
    minCount : int
        Minimum amount of occurrences of the subsequence (default is 5).
    useAbstract : bool
        Specifies whether the abstract should also be used - only available
        for NYT (default is True).

    Returns
    -------
    list
        Sorted (descending count) list of tuples with format:
        (word_sequence, count)
    """
    data = []
    articles = get_articles_as_list(articles)
    for a in articles:
        match = True
        if "abstract" in a and useAbstract:
            content = txt.parseSentence(a["headline"] + " " + a["abstract"])
        else:
            content = txt.parseSentence(a["headline"])
        if keyword not in content:
            match = False
        if match and (start is not None or end is not None):
            date = parse_pubdate(a["pub_date"])
            if start is not None and date < start:
                match = False
            if end is not None and date > end:
                match = False
        if match:
            if searchrange is not None:
                for index in [i for i, v in enumerate(content) if v == keyword]:
                    data.append(content[max(index - searchrange - 1, 0):
                                        min(index + searchrange, len(content))])
            else:
                data.append(content)
    substring_counts = subsequence_counts(data, minLength=minLength, minCount=minCount)
    result = {}
    for el in substring_counts:
        if keyword in el[0]:
            result[" ".join(el[0])] = el[1]
    return [(k, result[k]) for k in sorted(result, key=result.get, reverse=True)]
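# Hedged usage sketch for restore_keyword; the file name, keyword and date range
# are placeholders, and `_demo_restore_keyword` is a hypothetical helper. The
# project helpers (get_articles_as_list, txt, parse_pubdate, subsequence_counts)
# are assumed to be available.
def _demo_restore_keyword():
    import json
    from datetime import date
    with open("nyt_articles.json", "r", encoding="utf-8") as f:  # hypothetical file
        articles = json.load(f)
    phrases = restore_keyword("vaccine", articles,
                              start=date(2020, 3, 1), end=date(2020, 6, 30),
                              searchrange=4, minLength=2, minCount=5)
    # Each entry is (word_sequence, count), sorted by descending count
    for sequence, count in phrases[:10]:
        print(count, sequence)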
def get_group_cooccurrences(keywords, articles, starts=None, ends=None, useAbstract=True):
    """
    Get co-occurring words for multiple ``keywords``.

    Searches all words which occur together with words from ``keywords`` in
    headlines and abstracts for the given dataset of ``articles``.

    Notes
    -----
    In contrast to ``restore_keyword``, here we do not consider the positional
    distance of a word to the keyword in the headline/abstract.

    Parameters
    ----------
    keywords : list
        List of keywords in ``articles``.
    articles : dict
        Dict of news articles in JSON format.
    starts : dict of datetime.date
        Per-keyword dict; search is limited to articles which were published
        on or after this day for the respective keyword (default is None).
    ends : dict of datetime.date
        Per-keyword dict; search is limited to articles which were published
        on or before this day for the respective keyword (default is None).
    useAbstract : bool
        Specifies whether the abstract should also be used - only available
        for NYT (default is True).

    Returns
    -------
    dict
        Dict containing for each keyword a sorted (descending count) list of
        tuples with format: (co-occurring_keyword, count)
    """
    result = {}
    for keyword in keywords:
        result[keyword] = {}
    articles = get_articles_as_list(articles)
    for a in articles:
        if "abstract" in a and useAbstract:
            content = txt.parseSentence(a["headline"] + " " + a["abstract"])
        else:
            content = txt.parseSentence(a["headline"])
        for keyword in keywords:
            match = True
            if keyword not in content:
                match = False
            if match and (starts is not None or ends is not None):
                date = parse_pubdate(a["pub_date"])
                if starts is not None and starts[keyword] is not None:
                    if date < starts[keyword]:
                        match = False
                if ends is not None and ends[keyword] is not None:
                    if date > ends[keyword]:
                        match = False
            if match:
                for cooccurrence in content:
                    if cooccurrence != keyword:
                        if cooccurrence not in result[keyword]:
                            result[keyword][cooccurrence] = 1
                        else:
                            result[keyword][cooccurrence] += 1
    for keyword in keywords:
        result[keyword] = [(k, result[keyword][k]) for k in sorted(
            result[keyword], key=result[keyword].get, reverse=True)]
    return result
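# Hedged usage sketch for get_group_cooccurrences; keywords, dates and the input
# file are invented, and `_demo_group_cooccurrences` is a hypothetical helper.
# Note that `starts` and `ends` are dicts keyed by keyword.
def _demo_group_cooccurrences():
    import json
    from datetime import date
    with open("nyt_articles.json", "r", encoding="utf-8") as f:  # hypothetical file
        articles = json.load(f)
    keywords = ["vaccine", "lockdown"]
    starts = {"vaccine": date(2020, 3, 1), "lockdown": None}
    ends = {"vaccine": None, "lockdown": date(2020, 12, 31)}
    cooc = get_group_cooccurrences(keywords, articles, starts=starts, ends=ends)
    # cooc["vaccine"] is a list of (co-occurring_word, count) tuples, most frequent first
    for word, count in cooc["vaccine"][:10]:
        print(count, word)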
        # (closing statement of a preceding method that is not part of this snippet)
        return result

    def get_identified_species_count(self, columns):
        # Count how many distinct value combinations over `columns` occur in the CSV
        res = []
        for _, row in self.csv.iterrows():
            current_comb = ""
            for col in columns:
                current_comb += str(row[col])
            res.append(current_comb)
        c = Counter(res)
        return len(c)

    def search_combinations(self):
        print("Searching combinations...")
        result = self.random_search_column_combinations(100, 20)[:5]
        amount, speciescount, score, count, combinations = result[0]
        print(str(len(combinations)), "channels can distinguish",
              str(round(score * 100, 2)) + "%", "of all species")
        with open(data_paths.most_common_values_best_features, "w") as text_file:
            text_file.write("Amount: " + str(amount) + "\n")
            text_file.write("Species count: " + str(speciescount) + "\n")
            text_file.write("Score: " + str(score) + "\n")
            text_file.write("Features count: " + str(count) + "\n")
            text_file.write("Features: '" + "', '".join(x for x in combinations) + "'")


if __name__ == "__main__":
    ImageToCSVConverter.extract_occurences_train()
    TextPreprocessing.extract_train()
    MostCommonValueExtractor.extract()
    MostCommonValuesUniqueCountDiagram().search_combinations()
    # MostCommonValuesUniqueCountDiagram().plot()
def __init__(self):
    # Load the preprocessed training CSV together with the species list and count
    csv, species, species_c = TextPreprocessing.load_train()
    self.csv = csv
    self.species = species
    self.species_count = species_c