    def _read_csv(self, pathFilesF, pathFilesM):
        '''
        Read the female and male CSV corpora and return the samples grouped by
        class ("female"/"male"), keeping only samples with more than one token.
        '''
        textProcessing = TextProcessing()

        samplesInClasses = {}
        samplesInClasses["female"] = []
        samplesInClasses["male"] = []

        samplesF = self._read_files(pathFilesF)
        samplesM = self._read_files(pathFilesM)

        iText = 0
        totalText = len(samplesM) + len(samplesF)

        for sample in samplesF:
            nTokens = len(textProcessing.tokenize_one(sample))
            if nTokens > 1:
                #print("Texto " + str(iText) + " / " + str(totalText))
                samplesInClasses["female"].append(sample)
                iText = iText + 1
                #print(metaAttributes.all_meta_attributes)
                #input('-----------------------------')

        for sample in samplesM:
            nTokens = len(textProcessing.tokenize_one(sample))
            if nTokens > 1:
                #print("Texto " + str(iText) + " / " + str(totalText))
                samplesInClasses["male"].append(sample)
                iText = iText + 1

        return samplesInClasses
    def search_by_type_name(self, pathFiles, targetTypes, targetNames):
        textProcessing = TextProcessing()
        from models.text_analysis.nlp_basics.string_analysis import StringAnalysis
        stringAnalysis = StringAnalysis()
        filesFound = []
        nNames = len(targetNames)
        for root, dirs, files in os.walk(pathFiles):
            # Check that there are files and that they match the target types
            if len(files) > 0:
                for file in files:
                    tokens = textProcessing.tokenize_one(file.replace(".", " "))
                    if tokens[-1] in targetTypes:
                        # Check the names
                        vFound = numpy.zeros(nNames)
                        for iName in range(nNames):
                            fileName = textProcessing.text_lower_one(
                                [tokens[0]])[0]
                            name = textProcessing.text_lower_one(
                                [targetNames[iName]])[0]
                            dist = stringAnalysis.string_in_string_dist(
                                fileName, name)
                            if dist == 1:
                                vFound[iName] = 1
                        '''AND operation only'''
                        if numpy.sum(vFound) == nNames:
                            filesFound.append(root + "/" + file)
        #print(len(filesFound))
        #input('------------------')
        return filesFound
Example #3
    def __init__(self, dir=''):
        self.dir = dir
        self.dictionary = Dictionary.load(dir + 'myDictionary')

        self.tp = TextProcessing(dir=dir)

        self.size = 100
Example #4
    def __init__(self, file_in='', file_out='', dir='', n_docs=-1):
        self.file_in = file_in
        self.file_out = file_out
        self.n_docs = n_docs

        self.tp = TextProcessing(dir=dir)

        self.process_corpus()
Example #5
    def __init__(self, corpus_file, n_docs=-1):
        self.corpus_file = corpus_file
        self.n_docs = n_docs

        self.tp = TextProcessing(dir='')

        self.dictionary = Dictionary('')
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()
    def search_by_type(self, pathFiles, targetTypes):
        textProcessing = TextProcessing()
        filesFound = []
        for root, dirs, files in os.walk(pathFiles):
            # Check that there are files and that they match the target types
            if len(files) > 0:
                for file in files:
                    tokens = textProcessing.tokenize_one(file.replace(".", " "))
                    if tokens[-1] in targetTypes:
                        filesFound.append(root + "/" + file)
        return filesFound
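
# Hedged usage sketch (not in the source): the two methods above appear to belong to
# a FileSearch class -- _read_files further below calls FileSearch().search_by_type().
# The import path, search paths and extensions here are placeholders.
if __name__ == "__main__":
    fileSearch = FileSearch()  # assumes FileSearch is importable from this module
    csv_files = fileSearch.search_by_type("/path/to/corpus", ["csv"])
    reports = fileSearch.search_by_type_name("/path/to/corpus",
                                             ["pdf", "txt"],
                                             ["report", "2020"])
    print(len(csv_files), "files by type,", len(reports), "files by type and name")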
Example #7
    def __init__(self, dir='', load_dict=False):
        self.dir = dir
        self.tp = TextProcessing(dir=self.dir)

        # create empty dictionary:
        #self.dictionary = Dictionary()
        self.dictionary = Dictionary.load(dir + 'myDictionary')
        self.save_dict = True

        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()
Example #8
def create_training_data():
    data_lst = pickle.load(open('data/harvest.data', 'rb'))
    feature_process.feature_map['source'] = {'Google':1, 'Twitter for iPad':2, 'Echofon':3,
                                             'Bitly':4, 'twitterfeed':5, 'Twitter for iPhone':6,
                                             'Foursquare':7, 'Facebook':8, 'Twitter for Android':9,
                                             'TweetDeck':10, 'Twitter Web Client':11}
    feature_process.feature_map['geo'] = ['None']
    feature_process.feature_map['place'] = ['None']
    feature_process.feature_map['verified'] = ['False']
    feature_process.feature_map['geo_enabled'] = ['False']
    y = []
    x = []
    for i in range(0, len(data_lst)):        
        try:
            label = is_not_important[i]
        except Exception as e:
            label = 1
        
        data = data_lst[i]
        text = TextProcessing.process(data[0])
        source = FeatureMapping.mapping('source', data[1])
        re_tweet = data[2]
        geo = FeatureMapping.mapping_other('geo', data[3])
        place = FeatureMapping.mapping_other('place', data[4])
        hash_tag = data[5]
        media = data[6]
        verified = FeatureMapping.mapping_other('verified', data[7])
        follower = data[8]
        statues = data[9]
        desc = TextProcessing.process(data[10])
        friend = data[11]
        location = TextProcessing.process(data[12])
        geo_enabled = FeatureMapping.mapping_other('geo_enabled', data[13])
        
        y.append(label)
        x.append([text, source, re_tweet, geo, place, hash_tag, media, verified, follower, statues, desc, friend, location, geo_enabled])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, accuracy_score
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    fsc = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print('f1-score : ', fsc)
    print('accuracy : ', acc)
    print(y_pred)
    print(y_test)
Example #9
class PreprocessCorpusFile:
    def __init__(self, file_in='', file_out='', dir='', n_docs=-1):
        self.file_in = file_in
        self.file_out = file_out
        self.n_docs = n_docs

        self.tp = TextProcessing(dir=dir)

        self.process_corpus()

    def process_corpus(self):
        fin = open(self.file_in, 'r')
        fout = open(self.file_out, 'w')

        i = 0
        for line in fin.readlines():
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # write to file
            fout.write(' '.join(stemmed_tokens))
            fout.write('\n')

            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Sentence %s processed' % i)

        # convert tokenized documents into a document-term matrix
        fin.close()
        fout.close()
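
# Hedged usage sketch (not in the source): preprocess the first 10,000 lines of a
# raw corpus; the constructor runs process_corpus() immediately. File names are
# placeholders.
if __name__ == "__main__":
    PreprocessCorpusFile(file_in='corpus_raw.txt',
                         file_out='corpus_clean.txt',
                         dir='',
                         n_docs=10000)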
Example #10
class Sentences:
    def __init__(self, corpus_file, n_docs=-1):
        self.corpus_file = corpus_file
        self.n_docs = n_docs

        self.tp = TextProcessing(dir='')

        self.dictionary = Dictionary('')
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def __iter__(self):
        logging.info("Loading corpus in file %s" % self.corpus_file)

        i = 0
        for line in open(self.corpus_file, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # add the line to the dictionary (Dictionary expects a list of documents)
            self.dictionary.add_documents([stemmed_tokens])

            # yield the document as a bag-of-words vector
            yield self.dictionary.doc2bow(stemmed_tokens)

            # count number of documents and break if > num_docs
            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)
Example #11
def create_training_data():
    data_lst = pickle.load(open('data/harvest.data', 'rb'))
    feature_process.feature_map['source'] = {'Google':1, 'Twitter for iPad':2, 'Echofon':3,
                                             'Bitly':4, 'twitterfeed':5, 'Twitter for iPhone':6,
                                             'Foursquare':7, 'Facebook':8, 'Twitter for Android':9,
                                             'TweetDeck':10, 'Twitter Web Client':11}
    feature_process.feature_map['geo'] = ['None']
    feature_process.feature_map['place'] = ['None']
    feature_process.feature_map['verified'] = ['False']
    feature_process.feature_map['geo_enabled'] = ['False']
    y = []
    x = []
    for i in range(0, len(data_lst)):        
        try:
            label = is_not_important[i]
        except Exception as e:
            label = 1
        
        data = data_lst[i]
        text = TextProcessing.process(data[0])
        source = FeatureMapping.mapping('source', data[1])
        re_tweet = data[2]
        geo = FeatureMapping.mapping_other('geo', data[3])
        place = FeatureMapping.mapping_other('place', data[4])
        hash_tag = data[5]
        media = data[6]
        verified = FeatureMapping.mapping_other('verified', data[7])
        follower = data[8]
        statues = data[9]
        desc = TextProcessing.process(data[10])
        friend = data[11]
        location = TextProcessing.process(data[12])
        geo_enabled = FeatureMapping.mapping_other('geo_enabled', data[13])
        
        y.append(label)
        x.append([text, source, re_tweet, geo, place, hash_tag, media, verified, follower, statues, desc, friend, location, geo_enabled])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, accuracy_score
    clf = RandomForestClassifier()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    fsc = f1_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    print(fsc, acc)
    print(y_pred)
    print(y_test)
    def _read_files(self, pathFiles):
        import sys
        textProcessing = TextProcessing()
        fileSearch = FileSearch()
        files = fileSearch.search_by_type(pathFiles, "csv")
        samples = []
        csv.field_size_limit(sys.maxsize)
        for file in files:
            print(file)
            # use a context manager so each CSV file is closed after reading
            with open(file, newline='') as csvfile:
                csvreader = csv.reader(csvfile, delimiter=';', quotechar='|')
                #print(csvreader)
                for row in csvreader:
                    nTokens = len(textProcessing.tokenize_one(' '.join(row)))
                    if nTokens < 50:
                        #print(row)
                        samples.append(' '.join(row))

        return samples
Example #13
import os, heapq
import shelve
from collections import Counter
from typing import Union, List, Tuple
from tqdm import tqdm
from utils import timer, load_wapo
from text_processing import TextProcessing

text_processor = TextProcessing.from_nltk()
# include your customized text processing class


@timer
def build_inverted_index(
    wapo_jl_path: Union[str, os.PathLike], index_shelve_path: str
) -> None:
    """
    load wapo_pa3.jl to build the inverted index and store the index as a shelf in the provided path
    :param wapo_jl_path:
    :param index_shelve_path:
    :return:
    """


    # Note: generating the inverted index in memory and then assigning it to the shelf
    # gives a big speed improvement, but doing so ignores the whole point of using a
    # shelf for the index. The current iteration takes about 15-25 minutes to run.
    with shelve.open(index_shelve_path, flag='n', writeback=True) as index:
        index["___count"] = Counter()  # used for analysis in custom processing
        for doc in load_wapo(wapo_jl_path):
            normal_tokens, stops = text_processor.get_normalized_tokens(doc['title'], doc['content_str'])
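            # The example is cut off here. A minimal, hedged sketch of how the loop
            # might continue -- it assumes each doc exposes an integer id under
            # doc['doc_id'] and that the shelf stores one postings list per token
            # (both assumptions, not confirmed by this snippet):
            index["___count"].update(stops)  # track how often stop words were removed
            for token in set(normal_tokens):
                index.setdefault(token, []).append(doc["doc_id"])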
Example #14
    def __init__(self, text):
        import time
        start = time.clock()
        '''
        -----------------------------------------------------------------------------------------------------------------------
        DEFINITION OF THE CONTROL PARAMETERS
        -----------------------------------------------------------------------------------------------------------------------
        '''
        tp = TextProcessing()
        
        self.nMaxLengthFreq = 16
        # NOTE 1: maximum word length considered in the word-length frequency distribution
        savePath = "/home/ahirton/Python/gender_classification/outputfiles/"
        #savePath = "/home/rpasti/workspace/gender_classification/outputfiles/"
        tagged = tp.tagging([tp.tokenize([text])[0]],savePath,"en")[0]
        fileUtils = FileUtils(savePath)
        
        text = re.sub("http","", text)
        self.raw = text
        
#        print tagged

        self.PARAGRAPHS = []
        self.SENTENCES = []
        self.WORDS = []
        delimiters = '\n','. \n', '! \n', '?\n', '.\n', '!\n', '?\n', '... \n' #, '... \n'#, ' \n ' #, " .\n", " !\n", ' ?\n'
        regexPattern = '|'.join(map(re.escape, delimiters))
       
        for paragraph in re.split(regexPattern,self.raw):        
            p = []
#            print ""
#            print paragraph            
#            raw_input(".----------------.END OF PARAGRAPH----------------.")
            #sentences = tp.tokenize_sentence([paragraph])[0]
            for sentence in tp.tokenize_sentence([paragraph])[0]: 
#                print ""
#                print sentence
#                print tp.tagging(tp.tokenize([sentence]))
#                raw_input(".---------------..END OF SENTENCE...------.")
                words = tp.tokenize([sentence])[0]
                #words = tp.remove_punctuation([words])[0]
                self.WORDS.extend(words)
                self.SENTENCES.append(sentence)
                p.append(words)
#                print paragraph
#                print sentence
#                print words
#                print self.WORDS
#                raw_input('XXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
            self.PARAGRAPHS.append(p)

        self.C = len(text)
        self.LOWER = MetaAttributes._count_char(text, "^[a-z_-]*$")
        self.UPPER = MetaAttributes._count_char(text, "^[A-Z_-]*$")
        self.NUMBERS = MetaAttributes._count_char(text, r"^[\d]*$")
        self.WHITE = MetaAttributes._count_char(text, "^[ ]*$")
        self.TAB = MetaAttributes._count_char(text, r"^[\t]*$")
        self.N = len(self.WORDS)
        self.SIZES = []
        self.FREQ = {}
        
        for w in self.WORDS:
            self.SIZES.append(len(w))

        self.FREQ = dict(nltk.FreqDist(self.WORDS))
        self.V = dict(nltk.FreqDist(self.FREQ.values()))
        self.VRICH = self.N - len(self.V)
        self.HXLEGO = []
        self.HXDISLEGO = []

        for w, t in self.FREQ.items():
            if t == 1:
                self.HXLEGO.append(w)
            elif t == 2:
                self.HXDISLEGO.append(w)

        self.TAGGED = tagged
        self.S = len(self.SENTENCES)
            
        self.pwdictionary = semantic_dictionaries.extended_positive()
        self.nwdictionary = semantic_dictionaries.extended_negative()
        self.neutralwdictionary = semantic_dictionaries.extended_neutral_words()
        self.LIWCdict = fileUtils.load_object("liwc", "dict")
Example #15
from read_data import ReadData
from text_processing import TextProcessing

st.set_page_config(layout="wide")

st.markdown("<h1 style='text-align: center; color: black;'>Multipurpose Natural Language Processing App</h1>", 
            unsafe_allow_html=True)
st.markdown(Config.hide_streamlit_style, unsafe_allow_html=True)

data_choice = st.radio("Select your preferred way of data input", ('Upload a file', 'Direct text input'))

if data_choice == 'Upload a file':
    uploaded_file = st.sidebar.file_uploader("Upload your file:", type=['txt'])
    read_obj = ReadData(uploaded_file)
    data = read_obj.read_file_txt()
    input_type = True

else:
    data = st.text_input('Input your text here:')
    input_type = False

if data is not None:
    model_option = st.selectbox("Please choose your intended model:", ["Text Summarization"])
    process_obj = TextProcessing(data)
    cleaned_data = process_obj.text_cleaning(input_type)
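    # The example stops after cleaning. A hedged sketch of how the selected model
    # might be applied and displayed; 'summarize' is a hypothetical method name,
    # not confirmed by the source.
    if model_option == "Text Summarization":
        summary = process_obj.summarize(cleaned_data)  # hypothetical API
        st.write(summary)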

    

    
    
    
Example #16
class MyWord2Vec:
    def __init__(self, dir=''):
        self.dir = dir
        self.dictionary = Dictionary.load(dir + 'myDictionary')

        self.tp = TextProcessing(dir=dir)

        self.size = 100

    def load_corpus(self, file_name, num_docs):
        texts = []
        i = 0
        for line in open(file_name, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # add tokens to list
            texts.append(stemmed_tokens)

            # count number of documents and break if > num_docs
            i += 1
            if num_docs != -1 and i >= num_docs:
                break

        # convert tokenized documents into a document-term matrix
        return texts

    def train_model(self, file_name='corpus.txt', num_docs=-1, size=100):
        self.size = size

        # generate corpus
        #corpus = self.load_corpus(file_name, num_docs)
        corpus = LineSentence(file_name, limit=num_docs)

        # generate Word2Vec model
        model = Word2Vec(corpus, size=size, window=5, min_count=10, workers=3)
        return model

    def update_model(self, model, file_name, num_docs=-1):
        # generate new corpus
        corpus = self.load_corpus(file_name, num_docs)

        # generate Word2Vec model
        model.update(corpus)

    def get_word_embedding(self, model, word):
        if word in model.wv.vocab:
            vec = model.wv[word]
        else:
            w_clean = self.tp.clean_word(word)
            if w_clean in model.wv.vocab:
                vec = model.wv[w_clean]
            else:
                vec = np.zeros(self.size)

        return vec

    def get_sentence_embedding(self, model, line):
        words = self.tp.clean_line(line)
        vec = np.zeros(self.size)

        n_words = 0
        for w in words:
            if w in model.wv:
                vec += model.wv[w]
                n_words += 1

        if n_words > 0:
            return vec / n_words
        else:
            return vec

    def save_model(self, model):
        model.save(self.dir + 'myW2Vmodel')
        #self.dictionary.save('myDictionary')

    def load_model(self):
        model = Word2Vec.load(self.dir + 'myW2Vmodel')
        return model
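
# Hedged usage sketch (not in the source): train, save and query the wrapper above.
# The constructor loads an existing 'myDictionary' file from `dir`, so that file
# must already exist; 'corpus.txt' is the default name used by train_model.
if __name__ == "__main__":
    w2v = MyWord2Vec(dir='')
    model = w2v.train_model(file_name='corpus.txt', num_docs=10000, size=100)
    w2v.save_model(model)
    print(w2v.get_word_embedding(model, 'language')[:5])
    print(w2v.get_sentence_embedding(model, 'natural language processing')[:5])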
Example #17
def main():
    input_directory = sys.argv[1]
    train_size = int(sys.argv[2])
    test_size = (100 - train_size) / 100

    ##### Step 1: Data Loading and Basic stats #####
    t0 = time()

    print()
    print('** STEP 1: Data Loading **')
    dl_obj = DataLoading()
    base_df = dl_obj.clean_data(input_directory)
    #prodid_ix = base_df.id.values
    #base_df = base_df.reindex(prodid_ix)

    ## This line should be removed ##
    #print('Only 1000 rows are loaded')
    #base_df = base_df.sample(10000, random_state = 123)

    target_matrix = dl_obj.get_multilabel(base_df)
    #target_matrix = target_matrix.reindex(prodid_ix)

    dl_obj.get_label_info(target_matrix)

    #### Step 2: feature Engineering #####

    print()
    print('** STEP 2: Text Processing **')
    tp_obj = TextProcessing()
    cnt_vectorizer, feature_matrix = tp_obj.run_textprocessing(base_df)

    feature_matrix = pd.DataFrame(feature_matrix.toarray())
    feature_matrix = feature_matrix.join(
        base_df[['vegetarian', 'spicy', 'garlic', 'fish']])
    feature_matrix.fillna(0, inplace=True)

    #### Step 3:
    ### STEP 1: Normalize the labels ###
    print()
    print('** Filter Rare Labels combination **')
    util = Utility()
    print("Feature Matrix Shape:{} Target Matrix.shape: {}"\
            .format(feature_matrix.shape, target_matrix.shape))
    feature_matrix_fltrd, target_matrix_fltrd = util.filter_rare_classes(
        feature_matrix, target_matrix)
    print("Feature Matrix Shape:{} Target Matrix.shape: {}"\
            .format(feature_matrix_fltrd.shape, target_matrix_fltrd.shape))# (18340,3763)

    ### STEP 2: Train Test Split using StratifiedShuffleSplit #####
    print()
    print('** Train test split **')
    train_x, train_y, test_x, test_y = util.train_test_split(
        feature_matrix_fltrd, target_matrix_fltrd, test_size=test_size)
    print("Train_x Shape:{} \n Train_y.shape: {}"\
            .format(train_x.shape, train_y.shape)) # 14672
    print("Test_x Shape:{} \n Test_y.shape: {}"\
            .format(test_x.shape, test_y.shape)) # 3668

    ### Delete unnecessary files from memory ###

    ### STEP 3: Find Frequent Itemsets on training target matrix ####
    print()
    print('** STEP 3: Frequent Itemset **')

    col_mapping = {}
    for i_col, col_name in enumerate(target_matrix.columns.tolist()):
        col_mapping[i_col] = col_name

    supp = 0.05
    item_size = 3
    train_y_lil = lil_matrix(train_y)
    frequent_items_list = util.find_frequent_itemsets(train_y_lil, col_mapping,
                                                      supp, item_size)
    print('No of {} frequent itemsets with support {}: {} '\
           .format(item_size
                  , supp
                  , len(frequent_items_list))) #21 itemsets

    freq_additives_list = [
        items for itemset in frequent_items_list for items in itemset
    ]
    freq_additives_set = list(
        set([items for itemset in frequent_items_list for items in itemset]))
    freq_additives_cnt_dict = dict(Counter(freq_additives_list).items())

    #del base_df,target_matrix,target_matrix_fltrd, feature_matrix, feature_matrix_fltrd
    #gc.collect()

    ### STEP 4.1: Build 21 classifiers using Naive Bayes ####
    print()
    print('** STEP 4: LabelPowerSet Classifiers**')
    lp = LabelPowerSet(train_x, train_y, test_x, test_y, frequent_items_list,
                       'nb')

    model_list, metrics_labels, metrics_score, prediction_list = lp.build_model_lp(
    )
    index_value = [''.join(items) for items in frequent_items_list]
    metrics_labels_df = pd.DataFrame(
        metrics_labels,
        columns=['Accuracy', 'HammingLoss', 'JaccardSim'],
        index=index_value)
    metrics_score_df = pd.DataFrame(
        metrics_score,
        columns=['CoverageError', 'LblRankAvgPrec', 'LblRankLoss', 'LogLoss'],
        index=index_value)
    pickle.dump(model_list, open('LP_NB_21FSS.pkl', 'wb'))
    del model_list, lp
    metrics_labels_df.to_csv(input_directory + 'LP_NB_metrics_labels.csv')
    metrics_score_df.to_csv(input_directory + 'LP_NB_metrics_score.csv')

    ####### STEP 4.1: stack the predictions ############
    final_predictions = pd.DataFrame(np.zeros(
        test_y[freq_additives_set].shape),
                                     columns=freq_additives_set)
    for i_model in range(len(prediction_list)):
        #i_model = 0
        prediction = prediction_list[i_model]
        for col in prediction.columns:
            final_predictions[col] = final_predictions[col] + prediction[col]

    final_predictions_2 = final_predictions.apply(
        lambda x: x / freq_additives_cnt_dict[x.name])
    final_predictions_2 = final_predictions_2.applymap(lambda x: 1
                                                       if x >= 0.5 else 0)

    print()
    print('** Evaluation metrics : Majority Voting**')
    eval_metrics = EvaluationMetrics()
    eval_final = eval_metrics.get_classification_report_1(
        test_y[freq_additives_set], final_predictions_2, verbose=1)

    #### STEP 5: Build Binary Relevance models ####

    print()
    print('** STEP 5 : Binary Relevance Classifiers **')
    br = BinaryRelevance()
    label_df, score_df, classifier_list = br.build_model(
        train_x, train_y, test_x, test_y)
    pickle.dump(classifier_list, open('BR_NB_classifiersList.pickle', 'wb'))

    print()
    print('** Evaluation Metrics for BR Classfiers **')
    eval_metrics.get_classification_report_1(test_y[label_df.columns],
                                             label_df)
    # Accuracy: 0.42, Hamming Loss: 0.05, Jaccard Similarity: 0.62

    eval_metrics.get_classification_report_2(test_y[label_df.columns],
                                             score_df)
    # CoverageError : 5.61, LabelRankingAvgPrec :0.83, LabelRankingLoss : 0.04, Log_loss = 6.7

    ######## Binary Relevance predictions for frequent labels #####
    print()
    print('** BR classifiers evaluation for labels in frequent itemsets **')
    eval_metrics.get_classification_report_1(test_y[freq_additives_set],
                                             label_df[freq_additives_set])

    ### STEP 6: Final Predictions #########
    print()
    print('** STEP 6 : Final Predictions **')
    final_predictions_3 = pd.DataFrame(np.zeros(label_df.shape),
                                       columns=label_df.columns)

    ### Binary Relevance + LabelPowerset #####
    for col in final_predictions_3.columns:
        if col in freq_additives_set:
            final_predictions_3[col] = final_predictions_2[col]
        else:
            final_predictions_3[col] = label_df[col]

    print()
    print('** Evaluation Metrics for Final Prediction **')
    print('test_ shape', test_y[label_df.columns].shape)
    print('final predictions', final_predictions_3.shape)
    eval_final_2 = eval_metrics.get_classification_report_1(
        test_y[label_df.columns], final_predictions_3, verbose=1)

    ### STEP 7: Dumping Predictions ##########
    print()
    print('** STEP 7 : Saving Predictions **')
    test_y.to_csv('test_actual_labels.csv')
    final_predictions_3.to_csv('test_final_predicted_labels.csv')
    score_df.to_csv('test_scoring_from_br.csv')

    print('Entire Process completed in {} seconds'.format(time() - t0))
from tqdm import tqdm
import random
from textblob import TextBlob
import numpy as np
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
import torch
from keras import backend as K

print(K.tensorflow_backend._get_available_gpus())
logger = log.setup_custom_logger('analysis')
percent_of_data = .5  # fraction of the lines to read (0.5 = 50%)
sentiment_threshold = 0
text_process = TextProcessing()

tqdm.pandas()


def get_data(filename):
    #keep the header, take random rows
    logger.info("Reading {} percent of the data.".format(percent_of_data *
                                                         100))
    data = pd.read_csv(
        filename,
        header=0,
        skiprows=lambda index: index > 0 and random.random() > percent_of_data)
    logger.info('Data Size Read : {}'.format(len(data)))
    logger.info('Dropping NAs')
    data = data.dropna()
Example #19
class LDA:
    def __init__(self, dir='', load_dict=False):
        self.dir = dir
        self.tp = TextProcessing(dir=self.dir)

        # create empty dictionary:
        #self.dictionary = Dictionary()
        self.dictionary = Dictionary.load(dir + 'myDictionary')
        self.save_dict = True

        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def clean_line(self, line):
        raw = line.lower()
        tokens = self.tokenizer.tokenize(raw)

        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in self.en_stop]

        # stem tokens
        r = []
        for i in stopped_tokens:
            try:
                r.append(self.clean_word(i))
            except:
                logging.info("Can't process word %s" % i)
        return r

    def clean_word(self, word):
        stemmed_word = self.p_stemmer.stem(word)
        return stemmed_word

    def load_corpus(self, file_name, num_docs):
        logging.info("Loading corpus in file %s" % file_name)
        texts = []
        i = 0
        for line in open(file_name, 'r'):
            # cleaning the line
            stemmed_tokens = self.tp.clean_line(line)

            # add tokens to list
            texts.append(stemmed_tokens)

            # count number of documents and break if > num_docs
            i += 1
            if num_docs != -1 and i >= num_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)

        # turn our tokenized documents into a id <-> term dictionary
        #if len(self.dictionary) == 0:
        #self.dictionary = Dictionary(texts)
        #self.dictionary.save(self.dir + 'myDictionary')
        '''else:
            # self.dictionary.merge_with(Dictionary(texts))
            pass'''

        # convert tokenized documents into a document-term matrix
        return [self.dictionary.doc2bow(text) for text in texts]

    def train_model(self,
                    file_name='corpus.txt',
                    num_docs=-1,
                    num_topics=50,
                    passes=20,
                    multicore=False):
        # generate LDA model
        if not multicore:
            corpus = self.load_corpus(file_name, num_docs)
            ldamodel = LdaModel(corpus,
                                num_topics=num_topics,
                                id2word=self.dictionary,
                                passes=passes)
        else:
            corpus = Sentences(file_name, num_docs)
            ldamodel = LdaMulticore(corpus.__iter__(),
                                    num_topics=num_topics,
                                    id2word=self.dictionary,
                                    passes=passes,
                                    workers=3)

        return ldamodel

    def update_model(self, ldamodel, file_name, num_docs=-1):
        # generate new corpus
        corpus = self.load_corpus(file_name, num_docs)

        # generate LDA model
        ldamodel.update(corpus)

    def get_document_topics(self, ldamodel, text, n=1):
        text = self.tp.clean_line(text)
        bow = self.dictionary.doc2bow(text)

        if n == 1:
            return ldamodel.get_document_topics(bow, minimum_probability=0)

        list_d = []
        keys = set()
        for _ in range(n):
            d = dict(ldamodel.get_document_topics(bow))
            list_d.append(d)
            for k in d.keys():
                keys.add(k)

        probs = []
        for k in keys:
            mean = 0
            for i in range(n):
                if k in list_d[i].keys():
                    mean += list_d[i][k]
            probs.append((k, mean / n))
        return probs

    def show_topic_words(self, ldamodel, topic_id, topn=10):
        terms = ldamodel.get_topic_terms(topic_id, topn=topn)
        r = []
        for w_id, p in terms:
            print(self.dictionary[w_id], ' \t ', p)
            r.append((self.dictionary[w_id], p))
        return r

    def save_model(self, ldamodel):
        ldamodel.save(self.dir + 'myLDAmodel')

    def load_model(self):
        return LdaModel.load(self.dir + 'myLDAmodel')
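
# Hedged usage sketch (not in the source): the constructor loads an existing
# 'myDictionary' file from `dir`, so the dictionary must have been built beforehand;
# 'corpus.txt' is the default corpus name used by train_model.
if __name__ == "__main__":
    lda = LDA(dir='')
    model = lda.train_model(file_name='corpus.txt', num_docs=10000, num_topics=50)
    lda.save_model(model)
    print(lda.get_document_topics(model, "a short test document"))
    lda.show_topic_words(model, topic_id=0, topn=10)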
Example #20
from nltk.stem.porter import PorterStemmer  # type: ignore
from nltk.stem import SnowballStemmer
from text_processing import TextProcessing
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords  # type: ignore
from pathlib import Path
"""
    I made this simple script to test the effect of three popular stemming
    algorithms on the number of tokens returned. In increasing aggressiveness:
    - Porter
    - Snowball
    - Lancaster
"""

snow = TextProcessing(stemmer=SnowballStemmer('english').stem,
                      stop_words=stopwords.words("english"))
port = TextProcessing.from_nltk()
lan = TextProcessing(stemmer=LancasterStemmer().stem,
                     stop_words=stopwords.words("english"))
from utils import load_wapo

data_dir = Path("pa3_data")
wapo_path = data_dir.joinpath("wapo_pa3.jl")
ss = set()
ps = set()
ls = set()

for doc in list(load_wapo(wapo_path))[:200]:
    ss = ss.union(snow.get_normalized_tokens("", doc['content_str'])[0])
    ps = ps.union(port.get_normalized_tokens("", doc['content_str'])[0])
    ls = ls.union(lan.get_normalized_tokens("", doc['content_str'])[0])
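
# The snippet ends without reporting its results; a small completion that prints
# the vocabulary size produced by each stemmer over the same 200 documents:
print("Snowball vocabulary size:", len(ss))
print("Porter vocabulary size:  ", len(ps))
print("Lancaster vocabulary size:", len(ls))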