Example #1
import re
import sys

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


def lemmatize(dataset):
    try:
        # Learn more at https://www.quora.com/What-is-difference-between-stemming-and-lemmatization
        # S1_corpus and bar are module-level objects defined elsewhere in the project.
        ps = PorterStemmer()
        nl = WordNetLemmatizer()
        stop_words = set(stopwords.words('english'))

        for i in range(len(dataset)):
            # keep letters only, lowercase, and split on whitespace
            review = re.sub("[^a-zA-Z]", ' ', dataset['text'][i])
            review = review.lower().split()

            # lemmatize as a verb, then stem, skipping stopwords
            review = [
                ps.stem(nl.lemmatize(word, pos='v')) for word in review
                if word not in stop_words
            ]
            review = list(set(review))  # deduplicate tokens (order is lost)
            S1_corpus.append(review)
            bar.load(i, base=dataset, text='Stream 1')
        print('Stream 1: Processed')
        stream1 = pd.Series(S1_corpus)
        return stream1
    except KeyboardInterrupt:
        print("[STAGE 1] Terminating. Human Intervention Not Allowed")
        sys.exit(0)
    except AttributeError as e:
        print("[STAGE 1] Terminating due to", e)
        sys.exit(0)
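
A minimal driver for this snippet, including the NLTK data downloads it needs. S1_corpus and bar are stand-ins here for the project's own module-level objects, so treat this as a sketch rather than the original harness.

import nltk
import pandas as pd

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

S1_corpus = []  # shared output list the function appends to


class _Bar:  # no-op stand-in for the project's progress-bar helper
    def load(self, i, base=None, text=''):
        pass


bar = _Bar()

dataset = pd.DataFrame({'text': ['The movie was great!', 'A terribly boring plot.']})
print(lemmatize(dataset))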
Example #2
import sys

import pandas as pd


def dep_rel(dataset):
    try:
        # nlp_en (a spaCy pipeline), check_dep_parse, S3_dep_corpus and bar
        # are module-level objects defined elsewhere in the project.
        corpora = ''
        for increment in range(len(dataset)):
            sentence = dataset.iloc[increment, 0].lower()
            # keep a '<token> <head>;' pair for every dependency label
            # that passes check_dep_parse()
            for token in nlp_en(sentence):
                if check_dep_parse(token.dep_):
                    corpora += str(token) + ' ' + str(token.head) + ';'
            S3_dep_corpus.append(corpora)
            corpora = ''
            bar.load(increment, base=dataset, text='Stream 3')
        print('Stream 3: Processed')
        plot_nlp = nlp_en(sentence)  # parse of the last sentence, presumably kept for plotting
        stream3 = pd.Series(S3_dep_corpus)
        return stream3
    except TypeError as e:
        print("[STAGE 2] Unexpected Termination:", e)
        sys.exit(0)
    except KeyboardInterrupt:
        print("[STAGE 2] Human Interrupt Received! Exiting...")
        sys.exit(0)
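
dep_rel() assumes a spaCy pipeline bound to nlp_en and a check_dep_parse predicate that decides which dependency labels to keep; neither appears above, so the definitions below are illustrative assumptions only (the en_core_web_sm model must be installed).

import pandas as pd
import spacy

nlp_en = spacy.load('en_core_web_sm')  # assumed English pipeline
S3_dep_corpus = []


class _Bar:  # no-op stand-in for the project's progress-bar helper
    def load(self, i, base=None, text=''):
        pass


bar = _Bar()


def check_dep_parse(dep_tag):
    # hypothetical filter; the project's real label set may differ
    return dep_tag in {'amod', 'advmod', 'neg', 'nsubj', 'dobj'}


dataset = pd.DataFrame({'text': ['The food was not good.']})
print(dep_rel(dataset)[0])  # '<token> <head>;' pairs for the kept relations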
Example #3
import sys

import nltk
import pandas as pd

# Adjacent POS patterns worth keeping (rules R1-R8). Note: the original
# R9 (RB followed by VB) duplicated R5 and was unreachable.
BIGRAM_RULES = [
    ('JJ', 'NN'),  # R1
    ('RB', 'JJ'),  # R2
    ('JJ', 'JJ'),  # R3
    ('NN', 'JJ'),  # R4
    ('RB', 'VB'),  # R5
    ('VB', 'NN'),  # R6
    ('JJ', 'VB'),  # R7
    ('RB', 'RB'),  # R8
]


def bigram(dataset):
    try:
        # S2_super_corpus and bar are module-level objects defined elsewhere.
        corpora = ''
        for i in range(len(dataset)):
            sent = nltk.word_tokenize(dataset.iloc[i, 0].lower())
            PoS_Tag_sent = nltk.pos_tag(sent)

            # keep every adjacent word pair whose tags match one of the rules
            for (w1, tag1), (w2, tag2) in nltk.bigrams(PoS_Tag_sent):
                if any(tag1.startswith(t1) and tag2.startswith(t2)
                       for t1, t2 in BIGRAM_RULES):
                    corpora += w1 + ' ' + w2 + ';'

            S2_super_corpus.append(corpora)
            corpora = ''
            bar.load(i, base=dataset, text='Stream 2')
        print('Stream 2: Processed')
        stream2 = pd.Series(S2_super_corpus)
        return stream2
    except KeyboardInterrupt:
        print("[STAGE 1] Terminating. Human Intervention Not Allowed")
        sys.exit(0)
    except AttributeError as e:
        print("[STAGE 1] Terminating due to", e)
        sys.exit(0)
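
A possible driver for bigram(), again with stand-ins for the shared S2_super_corpus and bar objects and the NLTK data the tokenizer and tagger require.

import nltk
import pandas as pd

nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

S2_super_corpus = []


class _Bar:  # no-op stand-in for the project's progress-bar helper
    def load(self, i, base=None, text=''):
        pass


bar = _Bar()

dataset = pd.DataFrame({'text': ['A truly great film with very fine acting.']})
print(bigram(dataset)[0])  # e.g. 'truly great;great film;very fine;fine acting;' (tagger-dependent)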
Example #4
from nltk.corpus import wordnet


def syns_of_ngrams(ngram_list):
    # For each review (a list of words), gather the lowercased WordNet
    # synonyms of every word. bar is the module-level progress helper.
    syns_book = list()

    for i in range(len(ngram_list)):
        one_review = ngram_list[i]
        review_synonyms = list()
        for word in one_review:
            syns = list()
            for synonyms in wordnet.synsets(word):
                syns += synonyms.lemma_names()
            syns = list(set(syns))
            for j in range(len(syns)):
                review_synonyms.append(syns[j].lower())
                bar.load(j, base=syns, text='Generating Synonyms')
        # collect all synonyms seen in this review, not just the last word's
        # (the original reset the list per word and discarded the result of
        # ' '.join(...))
        syns_book.append(review_synonyms)

    return syns_book
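
A quick way to exercise syns_of_ngrams() on its own, assuming the WordNet corpus is available and using the same kind of no-op bar stand-in as above.

import nltk
from nltk.corpus import wordnet

nltk.download('wordnet', quiet=True)


class _Bar:  # no-op stand-in for the project's progress-bar helper
    def load(self, i, base=None, text=''):
        pass


bar = _Bar()

ngram_list = [['good', 'movie'], ['awful', 'plot']]
book = syns_of_ngrams(ngram_list)
print(book[0][:5])  # a few of the lowercased synonyms gathered for the first review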
Example #5
    # Learn more at https://www.quora.com/What-is-difference-between-stemming-and-lemmatization
    ps = PorterStemmer()
    nl = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    for i in range(len(dataset)):
        review = re.sub("[^a-zA-Z]", ' ', dataset['text'][i])
        review = review.lower().split()

        review = [
            ps.stem(nl.lemmatize(word, pos='v')) for word in review
            if word not in stop_words
        ]
        review = list(set(review))  # deduplicate tokens (order is lost)
        S1_corpus.append(review)
        bar.load(i, base=dataset, text='Stream 1')
    print('Stream 1: Processed')
    # ----------------------------------------------------------- STREAM 2 - BIGRAMS

    corpora = ''
    for i in range(len(dataset)):
        sent = nltk.word_tokenize(dataset.iloc[i, 0].lower())
        PoS_Tag_sent = nltk.pos_tag(sent)

        for (w1, tag1), (w2, tag2) in nltk.bigrams(PoS_Tag_sent):
            if tag1.startswith('JJ') and tag2.startswith('NN'):  # R1
                corpora += w1 + ' ' + w2 + ';'
            elif tag1.startswith('RB') and tag2.startswith('JJ'):  # R2
                corpora += w1 + ' ' + w2 + ';'
            elif tag1.startswith('JJ') and tag2.startswith('JJ'):  # R3
                corpora += w1 + ' ' + w2 + ';'