import preprocessing


def loadPreprocessedData(filename):
    # Load the raw file and split it into individual examples.
    inputData = preprocessing.loadFromFile(filename)
    dataExamples = preprocessing.generateExamples(inputData)
    # Replace emoji characters and emoticon symbols with text tokens.
    withTextEmojis = preprocessing.convertEmojisToText(dataExamples)
    withTextSymbols = preprocessing.convertSymbolEmojisToText(withTextEmojis)
    # Strip punctuation, stop words, slang and redundant whitespace.
    transformedData = preprocessing.removePunctuation(withTextSymbols)
    withoutStopWords = preprocessing.removeStopWords(transformedData)
    slanglessData = preprocessing.translateSlangWords(withoutStopWords)
    data = preprocessing.removeWhiteSpace(slanglessData)
    return data
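
A short usage sketch, not part of the original snippet: the file name is a placeholder, and it assumes the pipeline above returns a list of cleaned text strings.

examples = loadPreprocessedData('tweets_raw.txt')  # placeholder file name
print(len(examples), "examples loaded")
print(examples[0])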
Example #2
import pickle

import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

# removePunctuations, removeStopWords, steemer and the *Score helpers are
# defined elsewhere in this project.


def getdgist():
    # Read the article body and split it into sentences.
    with open("news.txt", "r") as fo:
        example_text = fo.read()
    sentences = sent_tokenize(example_text)

    # Read the article title.
    with open("news_title.txt", "r") as fo:
        title = fo.read()

    sentencearray = []
    sent_tfidf = []
    sent_title = []
    sent_position = []
    sent_len = []
    test_data = []

    example_array = word_tokenize(example_text)
    print("Length of Original text ->", len(example_array))
    print()

    for each_sent in sentences:
        each_sent = removePunctuations(each_sent.lower())
        sent_words = word_tokenize(each_sent)
        sent_words = removeStopWords(sent_words)
        sent_words = steemer(sent_words)
        sentence = ' '.join(sent_words)
        sentencearray.append(sentence)

    # Load the fitted vectorizer and tf-idf transformer, then build the
    # tf-idf matrix for the tokenized sentences.
    with open("tf.pickle", "rb") as tf_pickle:
        vect = pickle.load(tf_pickle)

    test_dtm = vect.transform(sentencearray)

    with open("tfidf.pickle", "rb") as tfidf_pickle:
        tfidf = pickle.load(tfidf_pickle)

    tf_idf_matrix = tfidf.transform(test_dtm)

    Tf_Idf = tf_idf_matrix.astype(np.float32).toarray()

    # Score each sentence by the average tf-idf weight of its terms.
    for index, each_TfIdf in enumerate(Tf_Idf):
        l = len(word_tokenize(sentencearray[index]))
        summ = 0
        if l != 0:
            for i in each_TfIdf:
                summ += i
            summ = summ / l
        sent_tfidf.append(summ)

    Title = removePunctuations(title)
    titleWords = word_tokenize(Title)
    titleWords = removeStopWords(titleWords)
    titleWords = steemer(titleWords)

    print("Length of titlewords ->", len(titleWords))
    print(titleWords)

    for each_sent in sentencearray:
        sentenceWords = word_tokenize(each_sent)
        sent_title.append(TitleScore(titleWords, sentenceWords))

    length = len(sentencearray)
    for index, each_sent in enumerate(sentencearray):
        sent_position.append(SentencePositionScore(index + 1, length))

    max_l = []
    for each_sent in sentencearray:
        max_l.append(len(word_tokenize(each_sent)))

    max_length = max(max_l)

    for i in range(len(sentencearray)):
        sent_len.append(SentenceLengthScore(sentencearray[i], max_length))

    for i, j, k, l in zip(sent_tfidf, sent_title, sent_position, sent_len):
        test_data.append([i, j, k, l])

    # Load the trained classifier.
    with open("clf.pickle", "rb") as classifier_pickle:
        clf = pickle.load(classifier_pickle)
    # print("sentencearray ->", len(sentencearray))

    # Total token count over the preprocessed sentences.
    count = 0
    for each_sent in sentencearray:
        count = count + len(word_tokenize(each_sent))
    print("Length of Summary ->", count)
    print(sentencearray)

    prediction = clf.predict(test_data)
    print(prediction)
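
The function above loads three pickles (tf.pickle, tfidf.pickle and clf.pickle) that must be produced by a separate training step. A minimal training-side sketch, assuming a scikit-learn CountVectorizer / TfidfTransformer pair and a GaussianNB classifier; the classifier choice and the placeholder sentences, labels and feature rows are illustrative assumptions, not taken from the original project:

import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB

# Placeholder training data: preprocessed sentences and 0/1 "belongs in summary" labels.
train_sentences = ["first training sentence", "second training sentence"]
train_labels = [1, 0]

vect = CountVectorizer()
train_dtm = vect.fit_transform(train_sentences)   # document-term counts
tfidf = TfidfTransformer()
tfidf.fit(train_dtm)                              # learns idf weights

# Feature rows would be built the same way as test_data above:
# [average tf-idf, title score, position score, length score] per sentence.
features = [[0.12, 0.5, 1.0, 0.8], [0.03, 0.0, 0.5, 0.4]]
clf = GaussianNB().fit(features, train_labels)

for name, obj in [("tf.pickle", vect), ("tfidf.pickle", tfidf), ("clf.pickle", clf)]:
    with open(name, "wb") as fh:
        pickle.dump(obj, fh)

At prediction time, vect.transform and tfidf.transform in getdgist reproduce the same document-term and tf-idf representations for the new article's sentences.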
Example #3
import preprocessing
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')

f = open('../data/positive_sample.csv', 'r')
fo = open('../data/positive_processed.csv', 'w')

# Field 5 of each '","'-delimited record is the tweet text.
line = f.readline()
while line:
    a = line.split(r'","')
    b = a[5][:-1]
    c = preprocessing.processTweet(b, stopWords, slangs)
    d = preprocessing.removeStopWords(c, stopWords)
    fo.write(d + '\n')
    line = f.readline()

f.close()
fo.close()

print "positive samples processed"

f = open('../data/negative_sample.csv', 'r')
fo = open('../data/negative_processed.csv', 'w')
line = f.readline()
while line:
    a = line.split(r'","')
    b = a[5][:-1]
    c = preprocessing.processTweet(b, stopWords, slangs)
    d = preprocessing.removeStopWords(c, stopWords)
    fo.write(d + '\n')
    line = f.readline()

f.close()
fo.close()

print("negative samples processed")
Example #4
import preprocessing
stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')

f = open('../data/sts/positive_sample.csv', 'r')
fo = open('../data/positive_processed.csv', 'w')

line = f.readline()
while line:
    a = line.split(r'","')
    b = a[5][:-1]
    c = preprocessing.processTweet(b, stopWords, slangs)

    d = preprocessing.removeStopWords(c, stopWords)

    fo.write(d + '\n')
    line = f.readline()

f.close()
fo.close()

print("positive samples processed")

f = open('../data/sts/negative_sample.csv', 'r')
fo = open('../data/negative_processed.csv', 'w')
line = f.readline()
while line:
    a = line.split(r'","')
    b = a[5][:-1]
    c = preprocessing.processTweet(b, stopWords, slangs)
    d = preprocessing.removeStopWords(c, stopWords)
    fo.write(d + '\n')
    line = f.readline()

f.close()
fo.close()

print("negative samples processed")
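
Examples #3 and #4 repeat the same read/process/write loop for the positive and negative files. A small refactoring sketch, assuming the same preprocessing API; the helper name processFile is hypothetical and not part of the original scripts:

import preprocessing

stopWords = preprocessing.getStopWordList('../resources/stopWords.txt')
slangs = preprocessing.loadSlangs('../resources/internetSlangs.txt')


def processFile(inPath, outPath):
    # Hypothetical helper: clean the tweet text in field 5 of each record
    # and write one cleaned tweet per output line.
    with open(inPath, 'r') as f, open(outPath, 'w') as fo:
        for line in f:
            tweet = line.split(r'","')[5][:-1]
            cleaned = preprocessing.processTweet(tweet, stopWords, slangs)
            cleaned = preprocessing.removeStopWords(cleaned, stopWords)
            fo.write(cleaned + '\n')


processFile('../data/sts/positive_sample.csv', '../data/positive_processed.csv')
processFile('../data/sts/negative_sample.csv', '../data/negative_processed.csv')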
Example #5
import preprocessing


def test_removeStopWords():
    tokensToFilter = ["In", "this", "text", "stop", "words", "have", "to", "be", "removed", "In", "each", "phrase", "occurances", "of", "words", "such", "as", "this", "that", "so", "and", "and", "of", "will", "be", "removed"]
    filteredTokens = ["In", "text", "stop", "words", "removed", "In", "phrase", "occurances", "words", "removed"]
    tokensToTest = preprocessing.removeStopWords(tokensToFilter)
    assert len(tokensToTest) == len(filteredTokens)
    for a, b in zip(filteredTokens, tokensToTest):
        assert a == b
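
The expected output above implies a case-sensitive membership filter: the capitalised "In" survives while lowercase stop words are dropped. A minimal sketch of such a filter, assuming a hand-picked lowercase stop-word set rather than the project's real list:

DEFAULT_STOP_WORDS = {"this", "that", "so", "and", "of", "have", "to", "be",
                      "will", "such", "as", "each", "in", "a", "the"}


def removeStopWords(tokens, stopWords=None):
    # Case-sensitive check, so "In" passes even though "in" is listed.
    if stopWords is None:
        stopWords = DEFAULT_STOP_WORDS
    return [token for token in tokens if token not in stopWords]

The test can then be collected and run with pytest or any runner that picks up test_-prefixed functions.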