def getcleantext(sourcefile=Maildata, splitsents=False, min_v=10, max_v=25000):
    """Load the mail data and build one cleaned text string from it.

    The data is read with ``readdata``, turned into a string by
    ``getstrings`` (using the module-level ``textfilename``), has its
    newlines replaced by spaces and is then passed through
    ``ReplaceTokens``.

    Args:
        sourcefile: Forwarded to ``readdata`` as ``sourcefile``.
        splitsents: Only when this is exactly ``True`` is ``removewords``
            applied to the cleaned text.
        min_v: Forwarded to ``removewords`` as ``min_v``.
        max_v: Forwarded to ``removewords`` as ``max_v``.

    Returns:
        A tuple ``(dfxls, mailtext, mailwords)``: the object returned by
        ``readdata``, the cleaned text, and either the result of
        ``removewords`` or an empty list when ``splitsents`` is not ``True``.
    """
    text_path = textfilename
    frame = readdata(sourcefile=sourcefile)

    raw_text = getstrings(filename=text_path, dataframe=frame)
    cleaned = ReplaceTokens(raw_text.replace('\n', ' '))

    # The identity test is deliberate: truthy values other than True do not
    # trigger the word filtering.
    words = (
        removewords(inputtext=cleaned, min_v=min_v, max_v=max_v,
                    stopword=stopword)
        if splitsents is True
        else []
    )
    return frame, cleaned, words
def get_data(source):
    """Return one 'tekst' value per conversation, taken from sent mails only.

    Args:
        source: Forwarded to ``readdata`` as ``sourcefile``.

    Returns:
        A list holding, for every ``conversationID`` among the rows with
        ``status == 1``, the ``first()`` aggregate of the ``tekst`` column.
    """
    frame = readdata(sourcefile=source)
    # Dataframe with only the sent mails (status equal to 1).
    sent_only = frame[frame.status == 1]
    first_texts = sent_only.groupby(['conversationID'])['tekst'].first()
    return first_texts.tolist()
import logging

from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from mimp import readdata

# Read the mail data and keep the first() 'tekst' value of every conversation.
dfxls = readdata(sourcefile='Mailgegevens.xlsx')
dfxls = dfxls.groupby(['conversationID'])['tekst'].first()
tekst = dfxls.tolist()


def read_input(input_file):
    """Yield a list of tokens for every usable entry of *input_file*.

    Despite the parameter name, *input_file* is simply iterated; this script
    passes it the in-memory list of mail texts built above. Entries that are
    float NaN (missing text) are skipped; every other entry is tokenized
    with ``gensim.utils.simple_preprocess``.

    Args:
        input_file: Iterable of texts, possibly containing float NaN values.

    Yields:
        The token list produced by ``simple_preprocess`` for each entry that
        is not NaN.
    """
    # Lazy %-style arguments: the (potentially huge) string form of the input
    # is only built when INFO logging is actually enabled.
    logging.info("reading file %s...this may take a while", input_file)
    for i, line in enumerate(input_file):
        if i % 10000 == 0:
            logging.info("read %d reviews", i)
        # NaN is the only float that differs from itself. isinstance (rather
        # than an exact type comparison) also catches float subclasses.
        if isinstance(line, float) and line != line:
            continue
        # Pre-process the text into a list of words.
        yield simple_preprocess(line)


documents = list(read_input(tekst))
# NOTE(review): `size` is the gensim < 4.0 spelling of this parameter; gensim
# 4.x renamed it to `vector_size` - confirm against the installed version.
model = Word2Vec(documents, size=200, window=10, min_count=5, workers=10)
model.wv.save('word2vec_mail.model')
# -*- coding: utf-8 -*-
"""
Add per-conversation first/last send dates to the mail data and flag the
first mail of each conversation.

Created on Tue Nov 20 19:25:31 2018

@author: Kraan
"""
import pandas as pd

from mimp import readdata

dfxls = readdata()  # read data

# Send dates grouped per conversation; sort=False keeps the groups in order of
# first appearance instead of sorting them by key.
_dates_by_conversation = dfxls.groupby(
    ['conversationID'], sort=False)['Verzenddatum']
s1 = _dates_by_conversation.max()  # date of the last mail in the conversation
s2 = _dates_by_conversation.min()  # date of the first mail in the conversation

# Put both dates side by side, indexed by conversationID, with clear names.
s = pd.concat([s1.rename('max_date'), s2.rename('min_date')], axis=1)

# Attach the two dates to every row of the original data.
dfxls_new = pd.merge(dfxls, s, left_on='conversationID', right_index=True,
                     how='left')

# Flag rows whose send date equals the earliest date of their conversation
# and whose 'Inkomend' value equals True. Several rows of one conversation
# are flagged if they share that earliest date.
# NOTE(review): the original comment called this the first mail of an
# *outbound* conversation, yet the filter keeps rows where 'Inkomend'
# (Dutch for "incoming") is True - confirm which one is intended.
dfxls_new['first_mail'] = (
    (dfxls_new['Verzenddatum'] == dfxls_new['min_date'])
    & (dfxls_new['Inkomend'] == True)
)
def getmaildata(sourcefile=Maildata):
    """Read the mail data and return it unchanged.

    Args:
        sourcefile: Forwarded to ``readdata`` as ``sourcefile``.

    Returns:
        Whatever ``readdata`` returns for that source.
    """
    return readdata(sourcefile=sourcefile)