示例#1
0
def getcleantext(sourcefile=Maildata, splitsents=False, min_v=10, max_v=25000):
    """Load the mail data and return it together with its cleaned text.

    Args:
        sourcefile: Passed to ``readdata`` as ``sourcefile``.
        splitsents: Only the literal ``True`` triggers the ``removewords``
            step; any other value leaves the word list empty.
        min_v: Forwarded to ``removewords`` as ``min_v``.
        max_v: Forwarded to ``removewords`` as ``max_v``.

    Returns:
        A tuple ``(dfxls, mailtext, mailwords)``: the object returned by
        ``readdata``, the text after newline replacement and
        ``ReplaceTokens``, and the ``removewords`` result (an empty list
        unless ``splitsents`` is ``True``).
    """
    target = textfilename
    dfxls = readdata(sourcefile=sourcefile)
    raw_text = getstrings(filename=target, dataframe=dfxls)
    # Newlines become spaces before the token replacement runs.
    mailtext = ReplaceTokens(raw_text.replace('\n', ' '))
    if splitsents is not True:
        return dfxls, mailtext, []
    mailwords = removewords(
        inputtext=mailtext, min_v=min_v, max_v=max_v, stopword=stopword
    )
    return dfxls, mailtext, mailwords
示例#2
0
def get_data(source):
    """Return the first ``tekst`` value of every conversation as a list.

    Args:
        source: Passed to ``readdata`` as ``sourcefile``.

    Returns:
        A list with one entry per ``conversationID`` group, taken from the
        rows whose ``status`` equals 1.
    """
    mails = readdata(sourcefile=source)
    # dataframe with only the sent mails
    sent = mails[mails.status == 1]
    first_per_conversation = sent.groupby(['conversationID'])['tekst'].first()
    return first_per_conversation.tolist()
示例#3
0
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from mimp import readdata
import logging

# Load the mail export and keep, per conversation, the first 'tekst' value.
dfxls = (
    readdata(sourcefile='Mailgegevens.xlsx')
    .groupby(['conversationID'])['tekst']
    .first()
)
# Plain Python list of those texts; this is what gets tokenised below.
tekst = dfxls.tolist()


def read_input(input_file):
    """Yield a token list for every non-NaN entry of *input_file*.

    Despite the parameter name, no file is opened here: ``input_file`` is
    simply iterated, so any iterable of texts works.

    Args:
        input_file: Iterable of texts. Entries that are float NaN are
            skipped.

    Yields:
        The result of ``simple_preprocess`` for each remaining entry.
    """
    # Lazy %-style arguments: the (potentially very large) input is only
    # rendered into the message when INFO logging is actually enabled.
    logging.info("reading file %s...this may take a while", input_file)

    for i, line in enumerate(input_file):
        if i % 10000 == 0:
            logging.info("read %s reviews", i)
        # NaN is the only float that is not equal to itself. isinstance
        # (rather than an exact type comparison) also catches float
        # subclasses such as numpy.float64.
        if isinstance(line, float) and line != line:
            continue
        # do some pre-processing and return list of words for each text
        yield simple_preprocess(line)


# Tokenise every mail text up front so Word2Vec receives a concrete list.
documents = [*read_input(tekst)]

# NOTE(review): `size` is the keyword this script passes; newer gensim
# releases may expect a different name - confirm against the installed version.
model = Word2Vec(
    documents,
    size=200,
    window=10,
    min_count=5,
    workers=10,
)

# Only the word vectors (model.wv) are written to disk.
model.wv.save('word2vec_mail.model')
示例#4
0
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 20 19:25:31 2018

@author: Kraan
"""

# `from pandas import pd` fails: pandas exports no name `pd`.
# The conventional alias import is what the rest of the script expects.
import pandas as pd

from mimp import readdata

dfxls = readdata()  # read data

# Group once and reuse the grouped column for both aggregations.
_dates_per_conversation = dfxls.groupby(['conversationID'],
                                        sort=False)['Verzenddatum']
s1 = _dates_per_conversation.max()  # date of the last mail in the conversation
s2 = _dates_per_conversation.min()  # date of the first mail in the conversation

# Put both dates side by side, indexed by conversationID, with new column names.
s = pd.concat(
    [s1.rename('max_date'), s2.rename('min_date')],
    axis=1)

# Add the per-conversation dates to every row of the original data.
dfxls_new = pd.merge(dfxls,
                     s,
                     left_on='conversationID',
                     right_index=True,
                     how='left')

# Create new column to identify first mail of outbound conversation.
# NOTE(review): the condition requires `Inkomend == True`; confirm that this
# matches the intended "outbound" meaning.
dfxls_new['first_mail'] = (dfxls_new['Verzenddatum']
                           == dfxls_new['min_date']) & (dfxls_new['Inkomend']
                                                        == True)
示例#5
0
def getmaildata(sourcefile=Maildata):
    """Return the result of ``readdata`` for *sourcefile*.

    Args:
        sourcefile: Passed to ``readdata`` as ``sourcefile``.

    Returns:
        Whatever ``readdata`` returns, unchanged.
    """
    return readdata(sourcefile=sourcefile)