Example #1
def tag_tokens_using_stanford_corenlp(token_list, corenlp_server_address='http://localhost:9000'):
    # print("tag_tokens_using_stanford_corenlp started")

    tagger = CoreNLPPOSTagger(url=corenlp_server_address)

    # The code below works around a limitation of the Stanford CoreNLP server, which only supports
    # 100000 characters per call. It splits the token list into smaller chunks, sends each chunk to
    # the server, and then joins the results into one list of tagged words ('tagged_text').
    tagged_text = []
    txt_size = len(token_list)
    i = 0
    while i < txt_size:

        if i + 6000 >= txt_size:
            tokens_to_tag = token_list[i:txt_size]
            i = txt_size
        else:
            tokens_to_tag = token_list[i:i + 6000]
            i += 6000

        tagged_text += tagger.tag(tokens_to_tag)

    # print("tag_tokens_using_stanford_corenlp ended")

    return tagged_text
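
A minimal usage sketch for the helper above (not part of the original example): it assumes a CoreNLP server is running at http://localhost:9000 and that CoreNLPPOSTagger is imported as in the snippet; the token list is illustrative only.

# Hedged usage sketch; requires a running CoreNLP server on localhost:9000
# and the tag_tokens_using_stanford_corenlp helper defined above.
tokens = "The quick brown fox jumps over the lazy dog .".split()
tagged = tag_tokens_using_stanford_corenlp(tokens)
print(tagged[:3])  # e.g. [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ')]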
def get_pos_tag(techs, words):
    """ Get POS tag of words.

        ([str], [str]) -> ([str], [str])
    """
    tags = []
    flag = False
    tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words)
    if len(words) != len(tagged_words):
        tagged_words = pos_tag(words)
    words = []
    for (word, tag) in tagged_words:
        if flag:
            word = "." + word
            flag = False
        if tag == "IN" and word in cin:
            tags.append("CIN")
        elif word in cv:
            tags.append("CV")
        elif word in techs:
            tags.append("TECH")
        elif word == ".":
            flag = True
            continue
        elif tag[:2] == "VB":
            tags.append("VB")
        else:
            tags.append(tag)
        words.append(word)
    return (words, tags)
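
A hedged call sketch for get_pos_tag (not from the original source): it assumes the module-level cin/cv word lists, the nltk pos_tag fallback, and a CoreNLP server at localhost:9000 are available; the technology names and sentence are made up for illustration.

# Hypothetical inputs; real behaviour depends on the module's cin/cv globals.
techs = ["mysql", "postgresql"]                       # hypothetical technology pair
words = "mysql is faster than postgresql .".split()   # pre-tokenized sentence
out_words, out_tags = get_pos_tag(techs, words)
print(list(zip(out_words, out_tags)))                 # tags depend on the cin/cv contents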
Example #4
def count(file_name):
    with open(os.path.join(os.path.pardir, "out", "tech_v6",
                           file_name)) as data3_file:
        num = 0
        for line in data3_file:
            if num % 4 == 2:
                words = line.split(" ")
                words[-1] = words[-1].strip()
                for (word, tag) in CoreNLPPOSTagger(
                        url='http://localhost:9000').tag(words):
                    if word not in stopwords_en and word not in modal_verbs and word not in synonyms:
                        if word in wf:
                            wf[word] += 1
                        else:
                            wf[word] = 1
                        if tag[:2] == "JJ":
                            if word in jj:
                                jj[word] += 1
                            else:
                                jj[word] = 1
                        elif tag[:2] == "NN":
                            if word in nn:
                                nn[word] += 1
                            else:
                                nn[word] = 1
                        elif tag[:2] == "RB":
                            if word in rb:
                                rb[word] += 1
                            else:
                                rb[word] = 1
            num += 1
Example #5
def classify(no):
    num = 0
    compa_sent_count = 0
    current_id = 0
    try:
        nlp = spacy.load('en')
        matcher = Matcher(nlp.vocab)
        add_patterns(matcher)
        # with io.open(os.path.join(os.pardir, "out", "tech_v5", "{}.txt".format(no)), "r", encoding="utf-8") as data_file:
        with open(os.path.join(os.pardir, "out", "tech_v6", "{}.txt".format(no))) as data_file:
            compa_sent_count = 0
            for line in data_file:
                if num % 4 == 0:
                    current_id = line
                elif num % 4 == 1:
                    tech_pair = line.split("\t")
                    tech_pair[-1] = tech_pair[-1].strip()
                elif num % 4 == 2:
                    tag_list = []
                    # for token in doc:
                    #     tag = token.tag_
                    #     word = token.text
                    # print(line)
                    flag = False
                    for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split(" ")):
                    # for (word, tag) in nltk.pos_tag(line.split(" ")):
                        if flag:
                            word = "." + word
                            flag = False
                        if tag == "IN" and word in cin:
                            tag_list.append("CIN")
                        elif word in cv:
                            tag_list.append("CV")
                        elif word in tech_pair:
                            tag_list.append("TECH")
                        elif word == ".":
                            flag = True
                        else:
                            tag_list.append(tag)
                    pos_tag = " ".join(tag_list)
                    patterns = matcher(nlp(pos_tag))
                    if patterns != []:
                        compa_sent_count += 1
                        out_file = open(os.path.join(os.pardir, "out", "tech_v6", "sentences_1.txt"), "a")
                        out_file.write("{}".format(current_id))
                        out_file.write("{}\n".format("\t".join(tech_pair)))
                        for pattern in patterns:
                            out_file.write("pattern" + str(pattern[0]) + "\t")
                        out_file.write("\n{}\n".format(line))
                        out_file.close()
                num += 1
    finally:
        print("Proc {}: {}/{} from - to {}".format(os.getpid(), compa_sent_count, num/4, current_id))
Example #6
def create_parse_trees(sentences):
    """ Create a parse tree for each sentence in the sentences list and return all trees in a list.
    Each sentence is passed as a string; the CoreNLP parser tokenizes and tags it automatically before parsing.

    :param sentences: Input sentences for parsing.
    :type sentences: list(str)
    :return: list(Tree)
    """

    # Create Stanford Parser.
    stanford_parser = CoreNLPPOSTagger()

    # Create a list to store all sentences parsed trees.
    parsed_sentences_trees = []

    # Create parsed trees and store them in the list.
    for sentence in sentences:
        for line in stanford_parser.raw_parse(sentence):
            temp_tree = Tree.fromstring(str(line))
            parsed_sentences_trees.append(temp_tree)

    return parsed_sentences_trees
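
A minimal usage sketch for create_parse_trees, assuming a CoreNLP server is reachable at the tagger's default URL and that Tree is imported from nltk as in the original module.

# Hedged usage sketch; needs a running CoreNLP server plus the imports of the
# original module (CoreNLPPOSTagger, nltk's Tree), exactly as used above.
sentences = ["The cat sat on the mat.", "Dogs bark loudly."]
for tree in create_parse_trees(sentences):
    tree.pretty_print()  # render each constituency tree as ASCII art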
Example #7
    def __init__(self, src_folder="../data/", freq_thre=100, corenlp_path="../stanford-corenlp/",\
                 ner_path="../stanford-ner/", verbose=False):

        # initialization
        self.src = os.path.join(src_folder, "reviews/")
        self.corenlp_path = os.path.normpath(corenlp_path) + "/"
        self.stanford_ner_path = os.path.normpath(ner_path) + "/"
        self.frequency_threshold = freq_thre
        self.dst = os.path.join(src_folder, "lexicon/candidates.json")
        self.dst_allReviews = os.path.join(src_folder, "allReviews/")
        self.dst_ner_tsv = os.path.join(src_folder, "ner_tsv/")
        self.dst_ne = os.path.join(src_folder, "ne/")
        self.verbose = verbose

        # pick up sentiment words
        self.pos_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"]
        self.pos_tagged_statistics = {}

        # it is based on CoreNLP, the newer version of the Stanford POS tagger
        self.pos_tagger = CoreNLPPOSTagger()
        self.stemmer = SnowballStemmer("english")
        self.stopwords = set(stopwords.words("english"))
        # remove `not` because we need to combine `not` with sentiment words
        self.stopwords.remove("not")
Example #8
    def modify(self):
        url = "http://localhost:9000/tregex"
        request_params = {"pattern": " SBAR|VP|NP=app $, /,/ "}
        # text = "Mexico City, the biggest city in the world, has many interesting archaeological sites."
        text = self.text
        # print(text)
        r = requests.post(url, data=text, params=request_params)
        json_data = json.loads(r.text)
        text1 = json_data['sentences'][0]['0']['match']
        tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
        line = tree.leaves()

        appos = ''
        begin_text = ''
        for tag in line:
            appos = appos + tag+' '

        text = text.replace(',', '')
        result = text.index(appos)
        text = text.replace(appos, '')
        for x in range(0,result):
            begin_text = begin_text + text[x]

        doc = nlp(begin_text)
        sub_ent = ''  # default in case spaCy finds no named entity in begin_text
        for ent in doc.ents:
            sub_ent = ent.label_
        if sub_ent == 'GPE' or sub_ent == 'LOC':
            text = text.replace(begin_text, '')
        text1 = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())

        tense = ''  # default so the question can still be built if no VBD/VBP/VBZ tag matches
        for tagg in text1:
            # line = 'She ate the fruits.'
            if tagg[1] == "VBD":
                tense = "was"

            # line = 'We eat the fruits.'
            if tagg[1] == "VBP":
                tense = "is"

            # line = 'She eats the fruits.'
            if tagg[1] == "VBZ":
                tense = "is"

        qts = "Which/Where"
        qts = qts + ' ' + tense + ' ' + appos + '?'
        return qts
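
The appositive above is located through the CoreNLP server's /tregex endpoint; the following standalone sketch mirrors that request (same pattern and example sentence as in the method's comments), assuming a local server with tregex support at localhost:9000.

import json
import requests

# Hedged sketch of the /tregex call used in modify(); the response layout
# ('sentences' -> first sentence -> match '0') follows the method above.
url = "http://localhost:9000/tregex"
params = {"pattern": " SBAR|VP|NP=app $, /,/ "}
text = "Mexico City, the biggest city in the world, has many interesting archaeological sites."
r = requests.post(url, data=text, params=params)
match = json.loads(r.text)['sentences'][0]['0']['match']
print(match)  # bracketed subtree covering the matched appositive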
Example #9
class StanfordPOSAnnotator(Annotator):
    def __init__(self, config_path=DEFAULT_CONFIG_PATH):
        self.config = load_config(config_path)
        corenlp_config = self.config["data"]["stanford_corenlp"]
        self.tagger = CoreNLPPOSTagger(
            url="http://%s:%d" %
            (corenlp_config["host"], corenlp_config["port"]))

        self.pos_map = self.config["model"]["STANFORD_POS_MAP"]

    def annotate(self, annotable):
        if (annotable.__class__.__name__ == "Document"):
            return self.annotate_document(annotable)
        elif (annotable.__class__.__name__ == "Sentence"):
            return self.annotate_sentence(annotable)
        else:
            raise AnnotationError(
                "This annotator only accepts Document or Sentence annotables.")

    def annotate_document(self, document):
        for sentence in document.sentences:
            self.annotate_sentence(sentence)

    def annotate_sentence(self, sentence):
        token_list = [token.surface for token in sentence.tokens]
        tagged_tokens = self.tagger.tag(token_list)

        for i in range(len(token_list)):
            sentence.tokens[i].annotations["STANFORD_POS"] = tagged_tokens[i][
                1]

            for pos_rgx in self.pos_map:
                if (re.match(pos_rgx, tagged_tokens[i][1])):
                    sentence.tokens[i].annotations["POS"] = self.pos_map[
                        pos_rgx].split("|")[0]

            if ("POS" not in sentence.tokens[i].annotations):
                sentence.tokens[i].annotations["POS"] = "x"
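
annotate_sentence matches each Stanford tag against the regex keys of STANFORD_POS_MAP and keeps the part of the mapped value before the first "|". A purely hypothetical fragment of such a map, only to illustrate the shape the class expects from its config (not the project's real configuration):

# Hypothetical STANFORD_POS_MAP fragment: keys are regexes matched against the
# Stanford tag, values are pipe-separated alternatives of which the first is kept.
STANFORD_POS_MAP = {
    "^NN": "n|noun",
    "^VB": "v|verb",
    "^JJ": "a|adjective",
    "^RB": "r|adverb",
}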
Example #10
from projectFiles import utils
from projectFiles.Utils import xlsxUtils

import pandas as pd
import numpy as np
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')
tagger = CoreNLPPOSTagger(url='http://localhost:9000')


def calculate_semantic_sim(word1, word2, pos1='n', pos2='n'):
    list_of_synsets1 = wn.synsets(word1, pos=pos1)
    list_of_synsets2 = wn.synsets(word2, pos=pos2)

    if not list_of_synsets1:
        return 0.0001
    if not list_of_synsets2:
        return 0.0001

    s1 = list_of_synsets1[0]
    s2 = list_of_synsets2[0]

    total_value = 0

    value = s1.wup_similarity(s2)
    value = utils.limit_value(value, 0.0001, 1.0)
    total_value += value
Example #11
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger
from nltk.tokenize.stanford import CoreNLPTokenizer

stpos, stner = CoreNLPPOSTagger('http://localhost:9001'), CoreNLPNERTagger(
    'http://localhost:9001')
sttok = CoreNLPTokenizer('http://localhost:9001')

sttok.tokenize(u'你好')

stpos.tag(u'basf')

stpos.tag(sttok.tokenize(u'text'))

stner.tag(u'你好')

stner.tag(sttok.tokenize(u'你好'))
Example #12
def main():
    information = {}
    sentences = set()
    for items in relations[pair]:
        sentences.add(items[5])
        information[items[5]] = (items[0], items[1], items[2], items[4])
    sentences = list(sentences)
    l = len(sentences)
    corpus = []
    topics = []
    for sentence in sentences:
        if pos_flag:
            words = sentence.split()
            words[-1] = words[-1].strip()
            tagged_words = CoreNLPPOSTagger(
                url='http://localhost:9000').tag(words)
            if len(words) != len(tagged_words):
                tagged_words = pos_tag(words)
            # print(tagged_words)
            # print(sentence.strip())
            for phrase in stop_phrases:
                n = len(phrase)
                for i in range(len(tagged_words) - n + 1):
                    if phrase == words[i:i + n]:
                        for j in range(i, i + n):
                            tagged_words[j] = (None, tagged_words[j][1])
            i = 0
            indices = []
            keywords = []
            for (word, tag) in tagged_words:
                if word in pair:
                    indices.append(i)
                    keywords.append(word)
                    i += 1
                elif word not in stop_words and tag in pos_tag_set and word is not None:
                    keywords.append(word)
                    i += 1
            # topics.append(" ".join(keywords))
            # topics.append(sentence.strip())
            if len(keywords) <= 10 and flag:
                ws = [w for w in keywords if w not in pair]
            else:
                ws = []
                # if len(indices) == 2:
                #     for j in range(len(keywords)):
                #
                #         if j > indices[0] and j <= indices[0] + 4 and keywords[j] not in pair and j < indices[1]:
                #             ws.append(keywords[j])
                #         elif j >= indices[1] - 2 and j <= indices[1] + 2 and keywords[j] not in pair:
                #             ws.append(keywords[j])
                # else:
                if True:
                    for j in range(len(keywords)):
                        for i in indices:
                            if j >= i - 2 and j <= i + 2 and keywords[
                                    j] not in pair and keywords[j] not in ws:
                                ws.append(keywords[j])
                                break
            # with open(keywords_path, "a") as keywords_file:
            #     keywords_file.write(",".join(ws)+"\n")
            #     keywords_file.write(sentence+"\n")
            corpus.append(ws)
            topics.append(" ".join(ws))
        else:
            corpus.append([w for w in sentence.split() if w not in stop_words])

    if query_flag:
        with open(os.path.join(os.pardir, "keywords", "corpus.pkl"),
                  'wb') as corpus_file:
            pickle.dump(corpus, corpus_file)
        with open(os.path.join(os.pardir, "keywords", "sentences.pkl"),
                  'wb') as sentences_file:
            pickle.dump(sentences, sentences_file)

    else:
        # Prepare word2vector model
        fname = os.path.join(os.pardir, "data", "mymodel")
        model = gensim.models.Word2Vec.load(fname)
        model.init_sims(replace=True)

        # Build weighted graph
        # dictionary = Dictionary(corpus)
        # bow_corpus = [dictionary.doc2bow(document) for document in corpus]

        index = WmdSimilarity(corpus, model)
        G = nx.Graph()
        for i in range(l - 1):
            sims = index[corpus[i]]
            # print("query:")
            # print(corpus[i])
            # print(sentences[i])
            # print("sims:")
            for j in range(i + 1, l):
                # print(sims[j])
                # print(corpus[j])
                # print(sentences[j])
                # print()
                shreshold = set_shreshold(len(corpus[i]), len(corpus[j]))
                if sims[j] >= shreshold:
                    if i not in G: G.add_node(i)
                    if j not in G: G.add_node(j)
                    G.add_edge(i, j)
                    # G.add_edge(i, j, weight=sims[j])

        out_path = os.path.join(
            os.pardir, "{}_{}_{}.txt".format("&".join(pair),
                                             G.number_of_nodes(), l))
        # image_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.png".format("&".join(pair), G.number_of_nodes(), l))

        # Draw graph
        pos = nx.spring_layout(G)
        plt.figure(figsize=(19, 12))
        plt.axis('off')
        nx.draw_networkx_nodes(G, pos, node_size=50)
        nx.draw_networkx_edges(G, pos, width=0.75)
    #first compute the best partition
    communities = []
    partition = community.best_partition(G)
    for com in set(partition.values()):
        list_nodes = [
            nodes for nodes in partition.keys() if partition[nodes] == com
        ]
        communities.append(list_nodes)

    num = 0
    graph_indices = set()
    bloblist = []
    clusters = []
    for com in communities:
        if len(com) > 1:
            doc = ""
            for i in com:
                doc += topics[i] + " "
            bloblist.append(tb(doc))
            clusters.append(com)

    aspects[pair] = set()
    new_aspects[pair] = {}
    # if True:
    with open(out_path, "a") as out_file:
        for i, blob in enumerate(bloblist):
            # print("Top words in document {}".format(i + 1))
            scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
            sorted_words = sorted(scores.items(),
                                  key=lambda x: x[1],
                                  reverse=True)
            # word_num = 0
            aspect_keywords = []
            for word, score in sorted_words[:3]:
                out_file.write(word + ", ")
                aspect_keywords.append(word)
            new_aspects[pair][" ".join(aspect_keywords)] = set()
            # for word, score in sorted_words:
            #     if word_num == 3:
            #         break
            #     if tf(word, blob) >= 0.2:
            #         word_num += 1
            #         out_file.write(word+", ")
            #         print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
            out_file.write(
                "---------------------------------------------------\n\n")
            for j in clusters[i]:
                temp = information[sentences[j]]
                new_aspects[pair][" ".join(aspect_keywords)].add(
                    (temp[0], temp[1], temp[2], temp[3], sentences[j]))
                aspects[pair].add(
                    (temp[0], temp[1], temp[2], " ".join(aspect_keywords),
                     temp[3], sentences[j]))
                out_file.write(",".join(corpus[j]) + "\n")
                out_file.write(sentences[j] + "\n")
                graph_indices.add(j)
            num += 1
        out_file.write(
            "other---------------------------------------------------\n\n")
        new_aspects[pair]["other"] = set()
        for j in range(len(sentences)):
            if j not in graph_indices:
                temp = information[sentences[j]]
                new_aspects[pair]["other"].add(
                    (temp[0], temp[1], temp[2], temp[3], sentences[j]))
                aspects[pair].add(
                    (temp[0], temp[1], temp[2], "", temp[3], sentences[j]))

                out_file.write(",".join(corpus[j]) + "\n")
                out_file.write(sentences[j] + "\n")
    plt.close('all')
Example #13
import nltk
from nltk.parse.corenlp import CoreNLPParser
from nltk.tag.stanford import CoreNLPPOSTagger
from pycorenlp import StanfordCoreNLP

from brain import memory
from brain.conjugator import conjugator

DEFAULT_TAGS = ['NNP', 'NNPS', 'NN', 'NNS']
KEYWORD_TAGS = DEFAULT_TAGS[:]
KEYWORD_TAGS.extend(['VBG'])
#KEYWORD_TAGS.extend(['VBG', 'PRP', 'PRP$', 'WP', 'WP$', 'WRB', 'WDT'])

STANFORD_TAGGER = CoreNLPPOSTagger('http://localhost:9000/')
STANFORD_SERVER = StanfordCoreNLP('http://localhost:9000/')
STANFORD_PARSER = CoreNLPParser('http://localhost:9000/')


def combine_similar(input, tags):
    output = []
    curr = []
    tag = ""
    for x in input:
        if x[1] not in tags:
            if len(curr) > 0:
                output.append((" ".join([x[0] for x in curr]), tag))
                curr[:] = []
                tag = ""
            output.append(x)
        elif x[1] == tag:
            curr.append(x)
Example #14
    def modify(self):
        url = "http://localhost:9000/tregex"
        request_params = {
            "pattern":
            " RB=n1  > (ADVP >> (S=n2 > ROOT)) | > (ADJP >> (S=n2 > ROOT))"
        }
        text = self.text
        # print(text)
        r = requests.post(url, data=text, params=request_params)
        json_data = json.loads(r.text)
        text1 = json_data['sentences'][0]['0']['match']
        tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
        str1 = tree.leaves()
        adverb = ' '.join(str1)

        text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag(
            text.split())
        #print(text_pos)
        c = 0
        for tagg in text_pos:
            if (c == 0 and tagg[1] != "NNP" and tagg[0] != 'I'):
                s = tagg[0].lower()
                text = text.replace(tagg[0], s)
            #line = 'He ran quickly.'
            if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[
                    c + 1][1] != 'VBG':
                verb_tense = "did"
                root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
                text = text.replace(tagg[0], root_verb)

            #line = 'I run quickly.'
            if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][
                    0] != 'are' and text_pos[c][0] != 'have':
                verb_tense = "do"
                root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
                text = text.replace(tagg[0], root_verb)

            #line = 'John runs quickly.'
            if tagg[1] == "VBZ" and text_pos[c + 1][1] != 'VBN' and text_pos[
                    c + 1][1] != 'VBG':
                verb_tense = "does"
                root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
                text = text.replace(tagg[0], root_verb)

            #line = 'John is playing quietly.'
            #line = 'John was playing quietly.'
            #line = 'John is going to play quietly.'

            if tagg[1] == "VBG" and text_pos[c - 1][1] != 'VB' and text_pos[
                    c - 1][1] != 'VBN':
                verb_tense = text_pos[c - 1][0]
                text = text.replace(text_pos[c - 1][0] + " ", "")

            #line = 'John has ran quickly.'
            if tagg[1] == "VBZ" and text_pos[
                    c + 1][1] == 'VBN' and text_pos[c + 2][1] != "VBG":
                verb_tense = text_pos[c][0]
                text = text.replace(text_pos[c][0] + " ", "")

            #line = 'John will be playing quietly.'
            if tagg[1] == "VBG" and text_pos[c - 1][1] == 'VB':
                verb_tense = text_pos[c - 2][0]
                text = text.replace(text_pos[c - 2][0] + " ", "")

            #line = 'John has been playing quietly.'
            #line = 'John had been playing quietly.'
            if (tagg[1] == "VBZ" or tagg[1] == "VBD") and text_pos[
                    c + 1][1] == 'VBN' and text_pos[c + 2][1] == 'VBG':
                verb_tense = text_pos[c][0]
                text = text.replace(text_pos[c][0] + " ", "")

            #line = 'John had left quietly.'
            #line = 'We have eaten the meal quietly.'
            if tagg[1] == "VBN" and tagg[0] != 'been' and (
                    text_pos[c - 1][0] == 'had' or text_pos[c - 1][0]
                    == 'have') and text_pos[c - 2][1] != 'MD':
                verb_tense = text_pos[c - 1][0]
                text = text.replace(text_pos[c - 1][0] + " ", "")

            #line = 'John will run quickly.'
            #line = 'John would have ran quickly.'
            if tagg[1] == "MD" and text_pos[c + 1][1] == 'VB':
                verb_tense = text_pos[c][0]
                text = text.replace(text_pos[c][0] + " ", "")
            c = c + 1
        """
        obj=""
        for i in line:
            classified_text = st.tag(word_tokenize(i))
            if classified_text[0][1]!='PERSON':
                break
            obj = obj + classified_text[0][0]+ " "
        """
        text = text.replace(".", " ?")
        text = text.replace(adverb, "")
        Q = 'How ' + verb_tense + ' ' + text
        return Q
Example #15
class SelectCandidates:
    """ This program selects candidate words from reviews.
        It picks up sentiment words and handles the negation problem.
        The result is stored in `src_folder/lexicon/candidates.json`.
        A minimal usage sketch follows the class definition.
    """

    def __init__(self, src_folder="../data/", freq_thre=100, corenlp_path="../stanford-corenlp/",\
                 ner_path="../stanford-ner/", verbose=False):

        # initialization
        self.src = os.path.join(src_folder, "reviews/")
        self.corenlp_path = os.path.normpath(corenlp_path) + "/"
        self.stanford_ner_path = os.path.normpath(ner_path) + "/"
        self.frequency_threshold = freq_thre
        self.dst = os.path.join(src_folder, "lexicon/candidates.json")
        self.dst_allReviews = os.path.join(src_folder, "allReviews/")
        self.dst_ner_tsv = os.path.join(src_folder, "ner_tsv/")
        self.dst_ne = os.path.join(src_folder, "ne/")
        self.verbose = verbose

        # pick up sentiment words
        self.pos_tags = ["JJ", "JJR", "JJS", "RB", "RBR", "RBS"]
        self.pos_tagged_statistics = {}

        # it is based on CoreNLP, the newer version of the Stanford POS tagger
        self.pos_tagger = CoreNLPPOSTagger()
        self.stemmer = SnowballStemmer("english")
        self.stopwords = set(stopwords.words("english"))
        # remove `not` because we need to combine `not` with sentiment words
        self.stopwords.remove("not")

    def stanford_ner(self):
        """ call stanford java ner api """

        self.merge_reviews()
        self.run_ner()
        self.find_named_entity()

    def merge_reviews(self):
        """ merge all reviews for named entity recognition """

        if self.verbose:
            print "Merging all reviews for named entity recognition" + "\n" + "-" * 80

        self.create_dir(self.dst_allReviews)

        for dirpath, dirs, files in os.walk(self.src):
            for f in files:
                filename = re.search(
                    "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json",
                    f).group(1)
                data = json.load(open(os.path.join(dirpath, f)))
                with open(os.path.join(self.dst_allReviews, filename + ".txt"),
                          "w+") as rf:
                    for r in data["reviews"]:

                        text = r["review"]
                        # remove accents
                        text = unicodedata.normalize("NFKD", text).encode(
                            "ASCII", "ignore")
                        # remove all website urls written in the review
                        text = re.sub(r"https?:\/\/.*[\r\n]*",
                                      " ",
                                      text,
                                      flags=re.MULTILINE)
                        # remove non english letters or words and numbers
                        text = re.sub(
                            r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "",
                            text)
                        # remove extra nextline
                        text = re.sub("(\\n)+", r" ", text)

                        # I'm -> I am
                        text = re.sub(r"'m ", " am ", text)
                        text = re.sub(r"'re ", " are ", text)
                        text = re.sub(r"'s ", " is ", text)
                        text = re.sub(r"'ve ", " have ", text)
                        text = re.sub(r"'d ", " would ", text)
                        text = re.sub(r" won't ", " will not ", text)
                        text = re.sub(r"n't ", " not ", text)
                        text = re.sub(r"'ll ", " will ", text)

                        # remove all punctuations except for , . ? ! ; : and -
                        # -: composite adj.
                        text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text)

                        # Space out every sign & symbol & punctuation
                        text = re.sub("([^\w\s])", r" \1 ", text)

                        text = text.replace("\'", "")
                        # remove ` - `, ` -`, `- `
                        text = re.sub(r"(\-)+", "-", text)
                        text = re.sub(
                            r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ",
                            text)
                        # turn multiple spaces into one
                        text = re.sub(r"(\s)+", " ", text)
                        # remove extra space at both ends of the text
                        text = text.strip()

                        rf.write(text)
                        rf.write("\n\n. CHANGE-REVIEW .\n\n")

    def run_ner(self):
        """ run shell to call NER """

        if self.verbose:
            print "Running shell to call Stanford NER" + "\n" + "-" * 80

        self.create_dir(self.dst_ner_tsv)

        comm = "java -mx1g -cp \"%s*:%slib/*\" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier %sclassifiers/english.all.3class.distsim.crf.ser.gz -outputFormat tabbedEntities -textFile %s > %s"
        for dirpath, dirs, files in os.walk(self.dst_allReviews):
            for f in files:
                filename = re.search(
                    "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).txt",
                    f).group(1)
                src_file = os.path.join(dirpath, f)
                dst_file = os.path.join(self.dst_ner_tsv, filename + ".tsv")
                command = comm % (self.stanford_ner_path,
                                  self.stanford_ner_path,
                                  self.stanford_ner_path, src_file, dst_file)
                subprocess.call(command, shell=True)

    def find_named_entity(self):
        """ find named entity from the ner tsv """

        if self.verbose:
            print "Finding named entity from ner tsv files" + "\n" + "-" * 80

        self.create_dir(self.dst_ne)

        for dirpath, dirs, files in os.walk(self.dst_ner_tsv):
            for f in files:
                filename = re.search(
                    "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).tsv",
                    f).group(1)
                src_file = os.path.join(dirpath, f)
                dst_file = os.path.join(self.dst_ne, filename + ".txt")
                rs = [set()]

                with open(src_file, "rb") as tsvin:
                    data = csv.reader(tsvin, delimiter="\t")
                    for r in data:
                        if len(r) != 0 and r[0] != "":
                            if r[1] == "ORGANIZATION" or r[1] == "PERSON" or r[
                                    1] == "LOCATION":
                                l = r[0].split(" ")
                                for i in l:
                                    if (i, r[1]) not in rs:
                                        rs[-1].add((i, r[1]))
                        elif len(r) > 2 and "CHANGE-REVIEW" in r[2]:
                            rs.append(set())

                with open(dst_file, "w+") as rf:
                    for rs_index in range(len(rs) - 1):
                        rf.write(str(rs_index) + ",FILEINDEX\n")
                        for i in rs[rs_index]:
                            rf.write(i[0] + "," + i[1] + "\n")

    def get_sentiment_words(self):
        """ load all reviews in src folder: data/reviews/ and merge them """

        # start Stanford CoreNLP server in a new process
        comm = "java -mx4g -cp \"%s*\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos -status_port 9000 -port 9000 -timeout 50000"
        command = comm % (self.corenlp_path)
        proc = subprocess.Popen(command, shell=True, preexec_fn=os.setsid)
        time.sleep(10)  # wait for starting Stanford CoreNLP server

        for dirpath, dir_list, file_list in os.walk(self.src):
            if self.verbose:
                print "Walking into directory: " + str(dirpath)

            if len(file_list) > 0:
                for f in file_list:
                    # in case there is a goddamn .DS_Store file
                    if str(f) == ".DS_Store":
                        if self.verbose:
                            print "Removing " + dirpath + "/" + str(f)
                        os.remove(os.path.join(dirpath, f))
                    else:
                        with open(os.path.join(dirpath, f)) as fp:
                            entity = json.load(fp)

                    if self.verbose:
                        print "Processing " + "\033[1m" + entity[
                            "entity"] + "\033[0m" + " in " + "\033[1m" + entity[
                                "category"] + "\033[0m"
                    self.analyze_part_of_speech(entity["reviews"], f)
            else:
                if self.verbose:
                    print "No file is found in " + str(dirpath)

        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)

        if self.verbose:
            print "Part of Speech Analysis on Reviews are Done"
            print "-" * 80

    def analyze_part_of_speech(self, reviews, filename):
        """ run the CoreNLP POS tagger to analyze the part of speech of every word """

        ner_set = self.load_ner_tags(filename)

        for review_index in range(len(reviews)):

            text = reviews[review_index]["review"]
            # remove accents
            text = unicodedata.normalize("NFKD",
                                         text).encode("ASCII", "ignore")
            # remove all website urls written in the review
            text = re.sub(r"https?:\/\/.*[\r\n]*",
                          " ",
                          text,
                          flags=re.MULTILINE)
            # remove non english letters or words and numbers
            text = re.sub(r"[^a-zA-Z!@#$%^&*():;/\\<>\"+_\-.,?=\s\|\']", "",
                          text)
            # remove extra nextline
            text = re.sub("(\\n)+", r" ", text)

            # I'm -> I am
            text = re.sub(r"'m ", " am ", text)
            text = re.sub(r"'re ", " are ", text)
            text = re.sub(r"'s ", " is ", text)
            text = re.sub(r"'ve ", " have ", text)
            text = re.sub(r"'d ", " would ", text)
            text = re.sub(r" won't ", " will not ", text)
            text = re.sub(r"n't ", " not ", text)
            text = re.sub(r"'ll ", " will ", text)

            # remove all punctuations except for , . ? ! ; : and -
            # -: composite adj.
            text = re.sub("[^\w\s,.?!;:\-]|\_", r" ", text)

            # space out every sign & symbol & punctuation
            text = re.sub("([^\w\s])", r" \1 ", text)

            text = text.replace("\'", "")
            # remove ` - `, ` -`, `- `
            text = re.sub(r"(\-)+", "-", text)
            text = re.sub(r"(\s)+\-(\s)+|(\s)+\-|\-(\s)+|(\A)\-|\-(\Z)", " ",
                          text)
            # turn multiple spaces into one
            text = re.sub(r"(\s)+", " ", text)
            # remove extra space at both ends of the text
            text = text.strip()

            # tokenize
            tokenized_text = text.split(" ")
            # remove empty string
            tokenized_text = [w for w in tokenized_text if w]

            # pos tag
            # a list of word tuples # [("great", "JJ"), ("tour", "NN") ...]
            if len(tokenized_text) == 0:
                continue
            word_tuple_list = self.pos_tagger.tag(tokenized_text)

            # remove stop_words
            word_tuple_list = [(w[0].lower(), w[1]) for w in word_tuple_list
                               if w[0].lower() not in self.stopwords]
            # remove empty string
            word_tuple_list = [(w[0], w[1]) for w in word_tuple_list if w[0]]

            combine_or_not = False
            combination_front = ""
            for word_tuple in word_tuple_list:
                # putting them into dictionary
                # add 1 to value if exist
                # add key and value if not
                if word_tuple[1] not in self.pos_tags:
                    if combine_or_not:
                        if combination_front in self.pos_tagged_statistics:
                            self.pos_tagged_statistics[combination_front] += 1
                        else:
                            self.pos_tagged_statistics[combination_front] = 1
                        combine_or_not = False
                        combination_front = ""
                elif word_tuple[0] not in ner_set[review_index]:
                    if combine_or_not:
                        if combination_front:
                            combination_front += "_" + word_tuple[0]
                        else:
                            combination_front = word_tuple[0]
                    else:
                        combine_or_not = True
                        combination_front = word_tuple[0]
            if combine_or_not:
                if combination_front in self.pos_tagged_statistics:
                    self.pos_tagged_statistics[combination_front] += 1
                else:
                    self.pos_tagged_statistics[combination_front] = 1

    def stem(self, candidate_lexicon):
        """ perform stemming on candidate lexicon | candidate lexicon should be a list """

        stemmed_lexicon = []
        for word in candidate_lexicon:
            stemmed_word = self.stemmer.stem(word)
            stemmed_lexicon.append({
                "word": word,
                "stemmed_word": stemmed_word
            })
        stemmed_lexicon = sorted(stemmed_lexicon, key=lambda k: k['word'])

        if self.verbose:
            print "\nMerging stemmed duplicates"
        processed_lexicon = {}
        length = len(stemmed_lexicon)
        cnt = 0
        for word_dict in stemmed_lexicon:
            cnt += 1
            if word_dict["stemmed_word"] not in processed_lexicon:
                processed_lexicon[word_dict["stemmed_word"]] = [
                    word_dict["word"]
                ]
            else:
                processed_lexicon[word_dict["stemmed_word"]].append(
                    word_dict["word"])
            if self.verbose:
                sys.stdout.write("\rStatus: %s / %s" % (cnt, length))
                sys.stdout.flush()

        processed_lexicon = [{
            "stemmed_word": key,
            "word": value
        } for key, value in processed_lexicon.iteritems()]
        # sorting dictionaries by word
        processed_lexicon = sorted(processed_lexicon,
                                   key=lambda k: k["stemmed_word"])

        return processed_lexicon

    def load_ner_tags(self, filename):
        """ load named entity for files """

        filename = re.search(
            "([A-Za-z|.]+\-*[A-Za-z|.]+\-*[A-Za-z|.]+\_.*).json",
            filename).group(1)
        ner_set = []
        with open(os.path.join(self.dst_ne, filename + ".txt"), "rb") as ne_f:
            tags = csv.reader(ne_f, delimiter=",")
            for tag in tags:
                if tag[1] == "FILEINDEX":
                    ner_set.append(set())
                else:
                    ner_set[-1].add(tag[0].lower())
        return ner_set

    def render_candidate_lexicon(self):
        """ render the candidate words """

        # filtered by self.frequency_threshold
        if self.verbose:
            print "Filtering out frequency lower than frequency_threshold" + "\n" + "-" * 80

        self.create_dir(self.dst)

        pos_tagged_words = []
        pos_tagged_words_under_thre = []
        for key in self.pos_tagged_statistics:
            if self.pos_tagged_statistics[key] > self.frequency_threshold:
                pos_tagged_words.append(key)
            else:
                pos_tagged_words_under_thre.append(key)

        if self.verbose:
            print "Stemming candidate words"
        pos_tagged_words = self.stem(pos_tagged_words)
        pos_tagged_words_under_thre = self.stem(pos_tagged_words_under_thre)

        ordered_dict_list = [[], []]
        if self.verbose:
            print "\nOrganizing candidate words"
        length = len(pos_tagged_words)
        for index in range(len(pos_tagged_words)):
            ordered_dict = OrderedDict()
            ordered_dict["index"] = index + 1
            ordered_dict["count"] = sum([
                self.pos_tagged_statistics[w]
                for w in pos_tagged_words[index]["word"]
            ])
            ordered_dict["stemmed_word"] = pos_tagged_words[index][
                "stemmed_word"]
            ordered_dict["word"] = pos_tagged_words[index]["word"]
            ordered_dict_list[0].append(NoIndent(ordered_dict))

            if self.verbose:
                sys.stdout.write("\rStatus: %s / %s" % (index + 1, length))
                sys.stdout.flush()

        if self.verbose:
            print "\nOrganizing candidate words <= frequency threshold"
        length = len(pos_tagged_words_under_thre)
        for index in range(len(pos_tagged_words_under_thre)):
            ordered_dict = OrderedDict()
            ordered_dict["index"] = index + 1
            ordered_dict["count"] = sum([
                self.pos_tagged_statistics[w]
                for w in pos_tagged_words_under_thre[index]["word"]
            ])
            ordered_dict["stemmed_word"] = pos_tagged_words_under_thre[index][
                "stemmed_word"]
            ordered_dict["word"] = pos_tagged_words_under_thre[index]["word"]
            ordered_dict_list[1].append(NoIndent(ordered_dict))

            if self.verbose:
                sys.stdout.write("\rStatus: %s / %s" % (index + 1, length))
                sys.stdout.flush()

        if self.verbose:
            print "\n" + "-" * 80
            print "Saving data to: \033[1m" + self.dst + "\033[0m"
        with open(self.dst, "w+") as f_out:
            f_out.write(
                json.dumps(ordered_dict_list, indent=4, cls=NoIndentEncoder))

    def create_dir(self, new_path):
        """ create the directory if it does not exist """

        dir1 = os.path.dirname(new_path)
        if not os.path.exists(dir1):
            if self.verbose:
                print "Creating directory: " + dir1
                print "-" * 80
            os.makedirs(dir1)

    def run(self):
        print "Selecting candidate words" + "\n" + "-" * 80

        self.stanford_ner()
        self.get_sentiment_words()
        self.render_candidate_lexicon()

    def PrintException(self):
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        print '    Exception in ({}, LINE {} "{}"): {}'.format(
            filename, lineno, line.strip(), exc_obj)
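
A minimal usage sketch for the class above (referenced from its docstring), assuming the default ../data/, ../stanford-corenlp/ and ../stanford-ner/ directories exist and Java is installed; it targets the same Python 2 environment as the class itself.

# Hedged usage sketch; all paths are the defaults from __init__.
selector = SelectCandidates(src_folder="../data/", freq_thre=100, verbose=True)
selector.run()  # NER, POS tagging, then writes lexicon/candidates.json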
Example #16
def genQuestion(line):
    """
    Generates a question from the given line of text.
    """
    bucket = {}  # Create an empty dictionary

    # POS tagging
    text = CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split())
    for i, j in enumerate(text):  # text is the list of (word, POS-tag) pairs
        if j[1] not in bucket:
            bucket[j[1]] = i  # record the index of the first occurrence of each tag

    if type(line) is str:  # If the passed variable is of type string.
        line = TextBlob(line)  # Create object of type textblob.blob.TextBlob

    question = ''
    l1 = ['NNP', 'VBG', 'VBZ', 'IN']
    l2 = ['NNP', 'VBG', 'VBZ']

    l3 = ['PRP', 'VBG', 'VBZ', 'IN']
    l4 = ['PRP', 'VBG', 'VBZ']
    l5 = ['PRP', 'VBG', 'VBD']
    l6 = ['NNP', 'VBG', 'VBD']
    l7 = ['NN', 'VBG', 'VBZ']

    l8 = ['NNP', 'VBZ', 'JJ']
    l9 = ['NNP', 'VBZ', 'NN']

    l10 = ['NNP', 'VBZ']
    l11 = ['PRP', 'VBZ']
    l12 = ['NNP', 'NN', 'IN']
    l13 = ['NN', 'VBZ']

    # The conditional statements below compare the tags found in the sentence (bucket) against the tag lists defined above.

    if all(key in bucket
           for key in l1):  #'NNP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'

    elif all(key in bucket for key in l2):  #'NNP', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'

    elif all(key in bucket
             for key in l3):  #'PRP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['PRP']] + ' ' + line.words[bucket['VBG']] + '?'

    elif all(key in bucket for key in l4):  #'PRP', 'VBG', 'VBZ' in sentence.
        question = 'What ' + line.words[
            bucket['PRP']] + ' ' + ' does ' + line.words[
                bucket['VBG']] + ' ' + line.words[bucket['VBG']] + '?'

    elif all(key in bucket for key in l7):  #'NN', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + ' ' + line.words[bucket['VBG']] + '?'

    elif all(key in bucket for key in l8):  #'NNP', 'VBZ', 'JJ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'

    elif all(key in bucket for key in l9):  #'NNP', 'VBZ', 'NN' in sentence
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'

    elif all(key in bucket for key in l11):  #'PRP', 'VBZ' in sentence.
        if line.words[bucket['PRP']] in ['she', 'he']:
            question = 'What' + ' does ' + line.words[bucket['PRP']].lower(
            ) + ' ' + line.words[bucket['VBZ']].singularize() + '?'

    elif all(key in bucket for key in l10):  #'NNP', 'VBZ' in sentence.
        question = 'What' + ' does ' + line.words[bucket[
            'NNP']] + ' ' + line.words[bucket['VBZ']].singularize() + '?'

    elif all(key in bucket for key in l13):  #'NN', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + '?'

    # When the tags are generated, 's is split into ' and s; the replacement below works around this.
    if 'VBZ' in bucket and line.words[bucket['VBZ']] == "’":
        question = question.replace(" ’ ", "'s ")

    # Print the generated question as output.
    if question != '':
        print('\n', 'Question: ' + question)
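
A minimal call sketch for genQuestion (not part of the original snippet), assuming the module's TextBlob and CoreNLPPOSTagger imports and a CoreNLP server at localhost:9000.

# Hedged usage sketch; the exact question depends on the tags the server returns.
genQuestion("John is eating an apple.")
# e.g. prints:  Question: What is John eating?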
Example #17
        'ORTH': 'JJR'
    }])


nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
add_patterns(matcher)

tech_pair = ["sortedlist", "sorteddictionary"]
tags = []
line = input(">>>")
while (line != "/"):
    flag = False
    tag_list = []
    words = line.split()
    tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words)
    if len(words) != len(tagged_words):
        tagged_words = pos_tag(words)
    for (word, tag) in tagged_words:
        # for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(line.split()):
        if flag:
            word = "." + word
            flag = False
        if tag == "IN" and word in cin:
            tag_list.append("CIN")
        elif word in cv:
            tag_list.append("CV")
        elif word in tech_pair:
            tag_list.append("TECH")
        elif word == ".":
            flag = True
Example #18
def main():
    information = {}
    sentences = set()
    for items in relations[pair]:
        sentences.add(items[5])
        information[items[5]] = (items[0], items[1], items[2], items[4])
    sentences = list(sentences)
    l = len(sentences)
    corpus = []
    topics = []
    for sentence in sentences:
        if pos_flag:
            words = sentence.split()
            words[-1] = words[-1].strip()
            tagged_words = CoreNLPPOSTagger(url='http://localhost:9000').tag(words)
            if len(words) != len(tagged_words):
                tagged_words = pos_tag(words)
            # print(tagged_words)
            # print(sentence.strip())
            for phrase in stop_phrases:
                n = len(phrase)
                for i in range(len(tagged_words) - n + 1):
                    if phrase == words[i:i+n]:
                        for j in range(i, i+n):
                            tagged_words[j] = (None, tagged_words[j][1])
            i = 0
            indices = []
            keywords = []
            for (word, tag) in tagged_words:
                if word in pair:
                    indices.append(i)
                    keywords.append(word)
                    i += 1
                elif word not in stop_words and tag in pos_tag_set and word is not None:
                    keywords.append(word)
                    i += 1
            # topics.append(" ".join(keywords))
            # topics.append(sentence.strip())
            if len(keywords) <= 10 and flag:
                ws = [w for w in keywords if w not in pair]
            else:
                ws = []
                # if len(indices) == 2:
                #     for j in range(len(keywords)):
                #
                #         if j > indices[0] and j <= indices[0] + 4 and keywords[j] not in pair and j < indices[1]:
                #             ws.append(keywords[j])
                #         elif j >= indices[1] - 2 and j <= indices[1] + 2 and keywords[j] not in pair:
                #             ws.append(keywords[j])
                # else:
                if True:
                    for j in range(len(keywords)):
                        for i in indices:
                            if j >= i - 2 and j <= i + 2 and keywords[j] not in pair and keywords[j] not in ws:
                                ws.append(keywords[j])
                                break
            # with open(keywords_path, "a") as keywords_file:
            #     keywords_file.write(",".join(ws)+"\n")
            #     keywords_file.write(sentence+"\n")
            corpus.append(ws)
            topics.append(" ".join(ws))
        else:
            corpus.append([w for w in sentence.split() if w not in stop_words])

    if query_flag:
        with open(os.path.join(os.pardir, "keywords", "corpus.pkl"), 'wb') as corpus_file:
            pickle.dump(corpus, corpus_file)
        with open(os.path.join(os.pardir, "keywords", "sentences.pkl"), 'wb') as sentences_file:
            pickle.dump(sentences, sentences_file)

    else:
        # Prepare word2vector model
        fname = os.path.join(os.pardir, "data", "mymodel")
        model = gensim.models.Word2Vec.load(fname)
        model.init_sims(replace=True)

        # Build weighted graph
        # dictionary = Dictionary(corpus)
        # bow_corpus = [dictionary.doc2bow(document) for document in corpus]

        index = WmdSimilarity(corpus, model)


        def set_shreshold(a, b):
            if ver_flag:
                if a == b:
                    return 0.52
                return 0.55 - 0.05 ** abs(a - b)
            else:
                if a == b:
                    return 0.55
                elif a > 3 or b > 3:
                    return 0.55 - 0.1 ** abs(a - b)
                return 0.55 - 0.05 ** abs(a - b)

        G = nx.Graph()
        for i in range(l - 1):
            sims = index[corpus[i]]
            # print("query:")
            # print(corpus[i])
            # print(sentences[i])
            # print("sims:")
            for j in range(i + 1, l):
                # print(sims[j])
                # print(corpus[j])
                # print(sentences[j])
                # print()
                shreshold = set_shreshold(len(corpus[i]), len(corpus[j]))
                if sims[j] >= shreshold:
                    if i not in G: G.add_node(i)
                    if j not in G: G.add_node(j)
                    G.add_edge(i, j)
                    # G.add_edge(i, j, weight=sims[j])


        out_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.txt".format("&".join(pair), G.number_of_nodes(), l))
        image_path = os.path.join(os.pardir, com_dir, "{}_{}_{}.png".format("&".join(pair), G.number_of_nodes(), l))

        # Draw graph
        pos = nx.spring_layout(G)
        plt.figure(figsize=(19,12))
        plt.axis('off')
        nx.draw_networkx_nodes(G, pos, node_size=50)
        nx.draw_networkx_edges(G, pos, width=0.75)
        plt.savefig(image_path)
        # plt.show()

        nnodes = G.number_of_nodes()

        if nnodes < 4:
            communities = []
            communities.append(G.nodes())
        elif nnodes <= 15:
            communities_generator = community.girvan_newman(G)
            temp_communities = next(communities_generator)
            communities = sorted(map(sorted, temp_communities))
        else:
            if nnodes < 50:
                part = 2 / 3
            else:
                part = 1 / 3
            # Detect communities
            communities_generator = community.girvan_newman(G)
            div_flag = True
            while div_flag:
                temp_communities = next(communities_generator)
                communities = sorted(map(sorted, temp_communities))
                div_flag = False
                for com in communities:
                    if len(com) > l * part:
                        div_flag = True
                        break
        num = 0
        graph_indices = set()
        bloblist = []
        clusters = []
        for com in communities:
            if len(com) > 1:
                doc = ""
                for i in com:
                    doc += topics[i] + " "
                bloblist.append(tb(doc))
                clusters.append(com)


        with open(out_path, "a") as out_file:
            for i, blob in enumerate(bloblist):
                print("Top words in document {}".format(i + 1))
                scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
                sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
                word_num = 0
                for word, score in sorted_words:
                    if word_num == 3:
                        break
                    if tf(word, blob) >= 0.2:
                        word_num += 1
                        out_file.write(word+", ")
                        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
                out_file.write("---------------------------------------------------\n\n")
                for j in clusters[i]:
                    out_file.write(",".join(corpus[j])+"\n")
                    out_file.write(sentences[j]+"\n")
                    graph_indices.add(j)
                num += 1
            out_file.write("other---------------------------------------------------\n\n")
            for j in range(len(sentences)):
                if j not in graph_indices:
                    out_file.write(",".join(corpus[j])+"\n")
                    out_file.write(sentences[j]+"\n")
Example #19
import json

import nltk
import requests

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag.stanford import CoreNLPNERTagger, CoreNLPPOSTagger

url = "http://localhost:9000/tregex"
request_params = {"pattern": " NP=n1 !>> NP >> (VP > (S=n2 > ROOT)) "}
text = "John would have loved Anne."
print(text)

r = requests.post(url, data=text, params=request_params)
json_data = json.loads(r.text)
text1 = json_data['sentences'][0]['0']['match']
tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
line = tree.leaves()

text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
c = 0
for tagg in text_pos:
    #line = 'John loved Anne.'
    if tagg[1] == "VBD" and text_pos[c][0] != 'had' and text_pos[
            c + 1][1] != 'VBG':
        verb_tense = "did"
        root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
        text = text.replace(tagg[0], root_verb)

    #line = 'John love Anne.'
    if tagg[1] == "VBP" and text_pos[c][0] != 'is' and text_pos[c][
            0] != 'are' and text_pos[c][0] != 'have':
        verb_tense = "do"
        root_verb = WordNetLemmatizer().lemmatize(tagg[0], 'v')
        text = text.replace(tagg[0], root_verb)
Example #20
from nltk.tag.stanford import CoreNLPPOSTagger
from nltk.parse.corenlp import CoreNLPDependencyParser
import spacy
from spacy.matcher import Matcher

# dep_parser = CoreNLPDependencyParser(url='http://localhost:9000')
#
# while True:
#     parse, = dep_parser.raw_parse(input(">>>"))
#
#     for governor, dep, dependent in parse.triples():
#         print(governor, dep, dependent)

line = input(">>>")
print(line.split(" "))
for (word, tag) in CoreNLPPOSTagger(url='http://localhost:9000').tag(
        line.split(" ")):
    print(word)
    print(tag)
print(tag)
Example #21
    def modify(self):
        url = "http://localhost:9000/tregex"
        request_params = {"pattern":  " NP=n1 !>> NP >> (VP > (S=n2 > ROOT)) "  }
        text = self.text
        r = requests.post(url, data=text, params=request_params)
        json_data = json.loads(r.text)
        text1 = json_data['sentences'][0]['0']['match']
        tree = nltk.Tree.fromstring(text1, read_leaf=lambda x: x.split("/")[0])
        line = tree.leaves()

        text_pos = CoreNLPPOSTagger(url='http://localhost:9000').tag(text.split())
        c = 0
        for tagg in text_pos:
            #line = 'John loved Anne.'	
            if tagg[1]== "VBD" and text_pos[c][0]!= 'had' and text_pos[c+1][1]!='VBG':
                verb_tense = "did"
                root_verb = WordNetLemmatizer().lemmatize(tagg[0],'v')
                text = text.replace(tagg[0],root_verb)

            #line = 'John love Anne.'
            if tagg[1]=="VBP" and text_pos[c][0]!= 'is' and text_pos[c][0]!='are' and text_pos[c][0]!= 'have':
                verb_tense = "do"
                root_verb = WordNetLemmatizer().lemmatize(tagg[0],'v')
                text = text.replace(tagg[0],root_verb)
            
            #line = 'John loves Anne.'
            if tagg[1]=="VBZ" and text_pos[c+1][1]!='VBN' and text_pos[c+1][1]!='VBG':
                verb_tense = "does"
                root_verb = WordNetLemmatizer().lemmatize(tagg[0],'v')
                text = text.replace(tagg[0],root_verb)

            #line = 'John is playing with Anne'
                    #who is john playig with?
            #line = 'John was playing with Anne.'
                    #who was john playing with?
            #line = 'John is going to play with Anne.'
                    #who is john going to play with?

            if tagg[1]=="VBG" and text_pos[c-1][1]!='VB' and text_pos[c-1][1]!='VBN':
                verb_tense = text_pos[c-1][0]
                text = text.replace(text_pos[c-1][0]+" ","")


            #line = 'John has loved Anne.'	
            if tagg[1]=="VBZ" and text_pos[c+1][1]=='VBN' and text_pos[c+2][1]!="VBG":
                verb_tense = text_pos[c][0]
                text = text.replace(text_pos[c][0]+" ","")

            #line = 'John will be playing with Anne.'
            if tagg[1]=="VBG" and text_pos[c-1][1]=='VB':
                verb_tense = text_pos[c-2][0]
                text = text.replace(text_pos[c-2][0]+" ","")

            #line = 'John has been playing with Anne.'
            #line = 'John had been playing with Anne.'
            if (tagg[1]=="VBZ" or tagg[1]=="VBD") and text_pos[c+1][1]=='VBN' and text_pos[c+2][1]== 'VBG':
                verb_tense = text_pos[c][0]
                text = text.replace(text_pos[c][0]+" ","")

            #line = 'John had loved Anne.'
            #line = 'We have loved Anne.'
            if tagg[1]=="VBN" and tagg[0]!='been' and (text_pos[c-1][0]== 'had' or text_pos[c-1][0]== 'have') and text_pos[c-2][1]!='MD':
                verb_tense = text_pos[c-1][0]
                text = text.replace(text_pos[c-1][0]+" ","")

            #line = 'John will have played with Anne.'
            #line = 'John will love Anne.'
            #line = 'John would have loved Anne.'
            if tagg[1]=="MD" and text_pos[c+1][1]=='VB':
                verb_tense = text_pos[c][0]
                text = text.replace(text_pos[c][0]+" ","")
            c  = c + 1


        obj = ' '.join(line)
        classified_text = CoreNLPNERTagger(url='http://localhost:9000').tag(obj.split())
        f = 1
        for i in classified_text:
            if i[1]!='PERSON':
                f = 0
                break

        text = text.replace(obj,'')
        text = text.replace("."," ?")
        Q = 'Who '+verb_tense+' '+text
        return Q