Exemplo n.º 1
0
def main():
    """Process every .txt file in ./dataset: tokenize, POS-tag, extract
    technical n-grams, write one row per file to output.csv, then expand
    the combined n-gram set and write it to ngrams.txt.

    Relies on project helpers: get_tokens, get_pos_tags, get_tech_ngrams,
    lexicon_expansion.
    """
    os.chdir("./dataset")

    # NOTE(review): the original called reload(sys) / sys.setdefaultencoding
    # ('utf8') — a Python 2-only hack that raises on Python 3. Removed; pass
    # an explicit encoding to open() instead if non-UTF-8 input turns up.

    ngrams_set = set()

    # newline='' keeps csv.writer from emitting blank rows on Windows.
    with open('output.csv', 'w', newline='') as output:
        csvwriter = csv.writer(output)
        csvwriter.writerow(["Data File", "Narrative", "N-grams"])
        for filename in glob.glob("*.txt"):  # renamed from `file` (shadows a Py2 builtin)
            with open(filename, 'r') as reader:
                # One lowercase string; newlines stripped and lines fused
                # with no separator, exactly as the original loop did.
                text = ''.join(line.rstrip('\n\r').lower() for line in reader)

                print("\nProcessing " + filename)
                tokens = get_tokens(text)                # tokenization
                tag_set = get_pos_tags(tokens)           # part-of-speech tagging
                ngrams = get_tech_ngrams(text, tag_set)  # technical n-gram extraction

                # list() so the CSV cell is a plain list, not "dict_keys(...)".
                csvwriter.writerow([filename, text, list(ngrams.keys())])

                ngrams_set |= set(ngrams.keys())

    ngrams_list = lexicon_expansion(list(ngrams_set))

    with open('ngrams.txt', 'w') as writer:
        for s in ngrams_list:
            writer.write(str(s) + '\n')
Exemplo n.º 2
0
def get_features(review, polarity):
    """Extract a feature dict from a review text.

    Features: unique-word count, first-person-pronoun ratio, the Automated
    Readability Index, chars-per-sentence ratio, a '$' presence flag, the
    supplied polarity, hotel-name mention count, a (disabled) misspelling
    count, plus bigram and sentiment features merged in from the project
    helpers get_bigrams / get_sentimentFeatures.

    Relies on module globals: hotels (collection of hotel names) and the
    NLTK tokenizers sent_tokenize / regexp_tokenize.
    """
    features = {}
    personal = 0    # first-person pronoun occurrences
    misspelt = 0    # misspelling detection is disabled below; stays 0
    hotelName = 0   # tokens matching a known hotel name
    personalPronouns = ["i", "me", "we", "our", "ours", "mine"]
    sentences = sent_tokenize(review)
    # (dropped: unused `sent = nltk.word_tokenize(review)` local)

    s = len(sentences)
    wordsR = regexp_tokenize(review, r"\w+")  # raw string for the regex
    for token in wordsR:
        if token in personalPronouns:
            personal += 1
        # Misspelling check disabled in the original (wordlist lookup):
        #if token not in set(words.words()):
        #    misspelt += 1
        if token in hotels:
            hotelName += 1
    w = len(wordsR)
    uniqueWords = len(set(wordsR))

    # Character-level pass over the review with all spaces removed.
    review = review.replace(" ", "")
    c = len(review)
    cap = 0
    features['dollar'] = False
    for ch in review:
        # BUG FIX: the original tested `ch.isupper` (the bound method, which
        # is always truthy) instead of calling it, so every char counted.
        if ch.isupper():
            cap += 1
        if ch == '$':
            features['dollar'] = True

    # Automated Readability Index (chars-per-word and words-per-sentence).
    ari = 4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43
    # NOTE(review): capRatio is chars-per-sentence, not capitals-per-sentence;
    # `cap` is computed but never used — possibly cap / float(s) was intended.
    # Preserved as-is to keep behavior unchanged.
    capRatio = c / float(s)
    personalRatio = float(personal) / w

    features['uniqueWords'] = uniqueWords
    features['personalRatio'] = personalRatio
    features['ari'] = ari
    features['capRatio'] = capRatio
    features['polarity'] = polarity
    features['hotel'] = hotelName
    # NOTE(review): `review` here is the space-stripped text — confirm the
    # helpers are meant to receive it without spaces.
    ngrams = get_bigrams(review, 'x')
    sentiments = get_sentimentFeatures(review, 'x')
    features.update(ngrams)
    features.update(sentiments)
    features['misspelt'] = misspelt
    return features
Exemplo n.º 3
0
def get_features(review, polarity):
    """Extract a feature dict from a review text.

    Features: unique-word count, first-person-pronoun ratio, the Automated
    Readability Index, chars-per-sentence ratio, a '$' presence flag, the
    supplied polarity, hotel-name mention count, a (disabled) misspelling
    count, plus bigram and sentiment features merged in from the project
    helpers get_bigrams / get_sentimentFeatures.

    Relies on module globals: hotels (collection of hotel names) and the
    NLTK tokenizers sent_tokenize / regexp_tokenize.
    """
    features = {}
    personal = 0    # first-person pronoun occurrences
    misspelt = 0    # misspelling detection is disabled below; stays 0
    hotelName = 0   # tokens matching a known hotel name
    personalPronouns = ["i", "me", "we", "our", "ours", "mine"]
    sentences = sent_tokenize(review)
    # (dropped: unused `sent = nltk.word_tokenize(review)` local)

    s = len(sentences)
    wordsR = regexp_tokenize(review, r"\w+")  # raw string for the regex
    for token in wordsR:
        if token in personalPronouns:
            personal += 1
        # Misspelling check disabled in the original (wordlist lookup):
        #if token not in set(words.words()):
        #    misspelt += 1
        if token in hotels:
            hotelName += 1
    w = len(wordsR)
    uniqueWords = len(set(wordsR))

    # Character-level pass over the review with all spaces removed.
    review = review.replace(" ", "")
    c = len(review)
    cap = 0
    features['dollar'] = False
    for ch in review:
        # BUG FIX: the original tested `ch.isupper` (the bound method, which
        # is always truthy) instead of calling it, so every char counted.
        if ch.isupper():
            cap += 1
        if ch == '$':
            features['dollar'] = True

    # Automated Readability Index (chars-per-word and words-per-sentence).
    ari = 4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43
    # NOTE(review): capRatio is chars-per-sentence, not capitals-per-sentence;
    # `cap` is computed but never used — possibly cap / float(s) was intended.
    # Preserved as-is to keep behavior unchanged.
    capRatio = c / float(s)
    personalRatio = float(personal) / w

    features['uniqueWords'] = uniqueWords
    features['personalRatio'] = personalRatio
    features['ari'] = ari
    features['capRatio'] = capRatio
    features['polarity'] = polarity
    features['hotel'] = hotelName
    # NOTE(review): `review` here is the space-stripped text — confirm the
    # helpers are meant to receive it without spaces.
    ngrams = get_bigrams(review, 'x')
    sentiments = get_sentimentFeatures(review, 'x')
    features.update(ngrams)
    features.update(sentiments)
    features['misspelt'] = misspelt
    return features
Exemplo n.º 4
0
def GenerateNgrams(text, N):
    """Build an order-(N-1) Markov table from the rows of `text`.

    Each row of `text` (a 2-D integer matrix — it must expose .shape and row
    indexing, e.g. a numpy array) becomes one "word": its values joined with
    ','. Returns (ngrams, words) where `ngrams` maps every space-joined
    sequence of N-1 consecutive words to the list of words that follow it,
    and `words` is the full word list in row order.

    Uses the module-level DEPURATION debug flag.
    """
    # One word per matrix row: values joined with ','. Each row is a word,
    # not a sentence — treating rows as sentences gave poor results
    # (e.g. "1-5.csv" in the original experiments).
    words = [",".join(str(value) for value in text[i])
             for i in range(text.shape[0])]

    ngrams = {}
    if N - 1 > 0:
        # Slide a window of N-1 words and record the word that follows it.
        for i in range(len(words) - (N - 1)):
            sequence = ' '.join(words[i:i + (N - 1)])
            # setdefault replaces the explicit `in ngrams.keys()` check.
            ngrams.setdefault(sequence, []).append(words[i + (N - 1)])

    if DEPURATION:  # module-level debug flag
        print("Words: " + str(words) + "\n")
        print("Ngramas: " + str(ngrams) + "\n")
        print()

    return ngrams, words
Exemplo n.º 5
0
def main():
    """Benchmark several n-gram extraction strategies.

    Command line:
        argv[1] -- test selector: a number ('3'), a range ('2-5'), or a
                   number prefixed with 'x' ('x3') to skip the pretests
        argv[2] -- input text file (plain UTF-8, one sentence per line)
        argv[3] -- optional temp directory (default "/tmp/")

    Relies on module-level helpers/imports: begin, end, colorf, savemodel,
    colibricore, and pynlpl's Windower / MultiWindower.
    """
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':  # 'x' prefix: run the test only, skip pretests
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    # Narrowed from a bare `except:`: missing argv -> IndexError,
    # malformed selector -> ValueError.
    except (IndexError, ValueError):
        print("Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except IndexError:  # narrowed from a bare `except:`
        print("Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",file=sys.stderr)
        sys.exit(2)

    try:
        tmpdir = sys.argv[3]
    except IndexError:  # narrowed from a bare `except:`
        tmpdir = "/tmp/"

    # Derived artefact paths inside the temp dir.
    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist",file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile,'r',encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        # A range of tests: re-invoke this script once per test so each runs
        # in a fresh process; the 'x' prefix skips the pretests on re-entry.
        print("Running tests " , begintest, " to ", endtest)
        for testnum in range(begintest, min(endtest+1,10)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " "+ textfile + " " + tmpdir)

    else:
        testnum = begintest
        print("-------------------- " + colorf('bold','TEST') + " #" + str(testnum) +" ----------------------")
        if testnum == 1:

            linecount = 0
            print("Extracting and counting n-grams (up to 8-grams,threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)")
            ngrams=defaultdict(int)
            b = begin()
            with open(textfile,'r',encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1,8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 2:
            print("Extracting and counting n-grams (up to 8-grams,threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)")

            from nltk.probability import FreqDist
            from nltk.util import ngrams

            fd=FreqDist()
            b = begin()
            with open(textfile,'r',encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1,9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:

            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel")
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,maxlength=8,doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model,modelfile)
            del model

        # BUG FIX: tests 4 and 5 used `if`, splitting the dispatch chain so
        # test numbers 1-3 also fell through to the final else and printed
        # "No such test". Both are now `elif` branches of the single chain.
        elif testnum == 4:
            linecount = 0
            print("Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back)  (Python defaultdict + Pynlpl Windower)")
            ngrams=defaultdict(int)
            b = begin()
            for n in range(1,9):
                with open(textfile,'r',encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            # Only count an n-gram if all its (n-1)-gram
                            # sub-parts were already seen (look-back pruning).
                            docount = True
                            if n>1:
                                for subngram in Windower(ngram,n-1):
                                    if not subngram in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 5:
            linecount = 0
            print("Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back)  (Python defaultdict + Pynlpl Windower)")
            ngrams=defaultdict(int)
            b = begin()
            with open(textfile,'r',encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1,8):
                        ngrams[ngram] += 1
            # Post-hoc thresholding: drop singletons (list() because we
            # delete from the dict while iterating its keys).
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2: del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:

            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel")
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2,maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model,modelfile)

        elif testnum == 7:

            print("Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with UnindexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1,maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model,modelfile)

        elif testnum == 8:

            print("Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1,maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model,modelfile)

            del model

        elif testnum == 9:
            print("Extracting and counting n-grams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model,modelfile)

        elif testnum == 10:

            print("Extracting and counting n-grams and skipgrams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,maxlength=8, doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model,modelfile)

        elif testnum == 11:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel")
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,maxlength=8,doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model,modelfile)
            del model


        else:
            print("No such test",file=sys.stderr)
        print()
Exemplo n.º 6
0
def GenerateText(ngrams, words, N, width):
    """Generate text from an order-(N-1) Markov table.

    `ngrams` maps a space-joined sequence of N-1 words to the list of words
    observed after it (as built by GenerateNgrams); `words` is the source
    word list. Roughly `width` words are appended after the seed sequence
    and the result is returned as a list of words. Uses `random` and the
    module-level DEPURATION debug flag.
    """
    # Seed: the first N-1 words (or the single first word when N == 1).
    if N > 1:
        currentSequence = ' '.join(words[0:(N - 1)])
    else:
        currentSequence = words[0]

    # Remembered so generation can restart when a sequence has no successor.
    firstSequence = currentSequence
    if DEPURATION:
        print("First sequence: " + str(firstSequence))

    # The seed starts the output text.
    output = currentSequence

    i = 0
    while i < width:
        if (N - 1) > 0:
            if currentSequence not in ngrams:  # membership on the dict, not .keys()
                # Dead end: restart from the initial sequence.
                currentSequence = firstSequence
                possibleWords = ngrams[currentSequence]
                nextWord = possibleWords[random.randrange(len(possibleWords))]
                output += ' ' + currentSequence + ' ' + nextWord

                if DEPURATION:
                    print("Output reset:" + str(output))
                    print()

                i += N

            else:
                # BUG FIX: this print ran unconditionally (debug leftover);
                # it is now guarded by the DEPURATION flag like the rest.
                if DEPURATION:
                    print(currentSequence)

                possibleWords = ngrams[currentSequence]
                nextWord = possibleWords[random.randrange(len(possibleWords))]

                if DEPURATION:
                    print("Possible words: " + str(possibleWords))
                    print("Next word: " + str(nextWord))
                    print()

                output += ' ' + nextWord

            # The last N-1 words of the output become the next lookup key.
            auxSequence = output.split(' ')
            currentSequence = ' '.join(auxSequence[len(auxSequence) - (N - 1):])

            if DEPURATION:
                print("Next step sequence: " + str(currentSequence))
                print()
        else:
            # N == 1: no context — sample uniformly from the word list.
            nextWord = words[random.randrange(len(words))]
            output += ' ' + nextWord

        i += 1

    return output.split(' ')
Exemplo n.º 7
0
def main():
    """Benchmark several n-gram extraction strategies.

    Command line:
        argv[1] -- test selector: a number ('3'), a range ('2-5'), or a
                   number prefixed with 'x' ('x3') to skip the pretests
        argv[2] -- input text file (plain UTF-8, one sentence per line)
        argv[3] -- optional temp directory (default "/tmp/")

    Relies on module-level helpers/imports: begin, end, colorf, savemodel,
    colibricore, and pynlpl's Windower / MultiWindower.
    """
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':  # 'x' prefix: run the test only, skip pretests
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    # Narrowed from a bare `except:`: missing argv -> IndexError,
    # malformed selector -> ValueError.
    except (IndexError, ValueError):
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except IndexError:  # narrowed from a bare `except:`
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)

    try:
        tmpdir = sys.argv[3]
    except IndexError:  # narrowed from a bare `except:`
        tmpdir = "/tmp/"

    # Derived artefact paths inside the temp dir.
    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(
        textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        # A range of tests: re-invoke this script once per test so each runs
        # in a fresh process; the 'x' prefix skips the pretests on re-entry.
        print("Running tests ", begintest, " to ", endtest)
        for testnum in range(begintest, min(endtest + 1, 10)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " +
                      textfile + " " + tmpdir)

    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" +
              str(testnum) + " ----------------------")
        if testnum == 1:

            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")

        elif testnum == 2:
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)"
            )

            from nltk.probability import FreqDist
            from nltk.util import ngrams

            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        # BUG FIX: tests 4 and 5 used `if`, splitting the dispatch chain so
        # test numbers 1-3 also fell through to the final else and printed
        # "No such test". Both are now `elif` branches of the single chain.
        elif testnum == 4:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            # Only count an n-gram if all its (n-1)-gram
                            # sub-parts were already seen (look-back pruning).
                            docount = True
                            if n > 1:
                                for subngram in Windower(ngram, n - 1):
                                    if not subngram in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 5:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back)  (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            # Post-hoc thresholding: drop singletons (list() because we
            # delete from the dict while iterating its keys).
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2: del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 7:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with UnindexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 8:

            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

            del model

        elif testnum == 9:
            print(
                "Extracting and counting n-grams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 10:

            print(
                "Extracting and counting n-grams and skipgrams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8,
                                                      doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)

        elif testnum == 11:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel"
            )
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model

        else:
            print("No such test", file=sys.stderr)
        print()
Exemplo n.º 8
0
    with open("./{}_no_clickbait_big.pickle".format(name), "rb") as f:
        ngrams_no_clickbait[key] = pickle.load(f)

# normalize the counts
# ---> count of each ngram / total occurrences of ngram

# select top 0.0005 of each (uni, bi, tri, 4), for C and NC separately

# loop over the dataset again, separately for C and NC rows
# for C data -> count how many posts contain top C list ngrams
# for NC data -> count how many posts contain top NC list ngrams

# Build the final clickbait n-gram list: normalise counts to relative
# frequencies per n-gram order, keep the most frequent slice of each order,
# and collect the surviving n-grams across orders.
clickbait_final_list = []
for n, ngrams in ngrams_clickbait.items():
    # Normalise each raw count by the total occurrences for this order.
    normalizer = sum(ngrams.values())
    for ngram in ngrams:  # iterate the mapping directly, not .keys()
        ngrams[ngram] /= normalizer

    # Keep the top 0.5% entries (most_common implies a collections.Counter).
    ngrams_clickbait[n] = ngrams.most_common(int(len(ngrams) * 0.005))
    clickbait_final_list += [elem[0] for elem in ngrams_clickbait[n]]

# Deduplicate across n-gram orders.
clickbait_final_list = set(clickbait_final_list)

no_clickbait_final_list = []
for n, ngrams in ngrams_no_clickbait.items():
    normalizer = sum(ngrams.values())
    for ngram in ngrams.keys():
        ngrams[ngram] /= normalizer

    ngrams_no_clickbait[n] = ngrams.most_common(int(
        len(ngrams.keys()) * 0.005))