# Example 1
def add_polarity_to_synsets(id_words, _state_queue=None, _id_process=None):
    """
    Adds the positive/negative/objective polarities of all the synsets currently in the table
    Synset, from the SentiWordNet corpus.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param id_words: Ids of the words whose associated synsets should be scored
    :type id_words: :obj:`list` of :obj:`int`
    :param _state_queue: Optional queue used to report progress to a parent process
    :param _id_process: Id of the current process, used together with _state_queue
    """

    from nltk.corpus import sentiwordnet as swn
    from loacore.load import synset_load
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()

    # Only the synsets attached to the requested words are (re)scored.
    synsets = synset_load.load_synsets(
        id_synsets=synset_load.get_id_synsets_for_id_words(id_words))

    synset_count = 0
    total_synset = len(synsets)
    for synset in synsets:
        # Print state
        synset_count += 1
        _commit_polarity_state(_state_queue, _id_process, synset_count,
                               total_synset)
        # Look up the SentiWordNet entry once instead of once per score.
        senti_synset = swn.senti_synset(synset.synset_name)
        synset.pos_score = senti_synset.pos_score()
        if synset.pos_score is not None:
            # There is an entry in the SentiWordNet database for our synset
            synset.neg_score = senti_synset.neg_score()
            # Objectivity is the remaining probability mass.
            synset.obj_score = 1 - (synset.pos_score + synset.neg_score)

            # Parameterized query (consistent with the other INSERT/UPDATE
            # calls in this module) instead of string concatenation.
            safe_execute(
                c, "UPDATE Synset SET (Pos_Score, Neg_Score, Obj_Score) "
                "= (?, ?, ?) WHERE Id_Synset = ?",
                0, _state_queue, _id_process,
                mark_args=(synset.pos_score, synset.neg_score,
                           synset.obj_score, synset.id_synset))

    if _state_queue is None:
        print("")
    safe_commit(conn, 0, _state_queue, _id_process)

    conn.close()
# Example 2
def add_synsets_to_sentences(sentences,
                             print_synsets=False,
                             _state_queue=None,
                             _id_process=None,
                             freeling_modules=None):
    """
    Performs a Freeling process to disambiguate words of the sentences according to their context
    (UKB algorithm) linking them to a unique synset (if possible).\n
    Our sentences are converted to Freeling Sentences before processing.\n
    Notice that even if we may have already computed the Lemmas for example, Freeling Sentences generated from our
    sentences are "raw sentences", without any analysis linked to their Words. So we make all the Freeling
    process from scratch every time, except *tokenization* and *sentence splitting*, to avoid any confusion.

    .. note:: This function should be used only inside the file_process.add_files() function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_synsets: If True, print disambiguation results
    :type print_synsets: boolean
    :param _state_queue: Optional queue used to report progress to a parent process
    :param _id_process: Id of the current process, used together with _state_queue
    :param freeling_modules: Pre-initialized (morfo, tagger, sen, wsd) Freeling
        modules; loaded here when None.
    """

    from loacore.conf import DB_TIMEOUT
    from loacore.utils.db import safe_commit, safe_execute

    freeling_sentences = [
        sentence.compute_freeling_sentence() for sentence in sentences
    ]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...",
                             " - "))
        morfo, tagger, sen, wsd = init_freeling()
    else:
        morfo, tagger, sen, wsd = freeling_modules

    _disambiguation_state(_state_queue, _id_process)
    # perform morphosyntactic analysis and disambiguation
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)
    # annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Copy freeling results into our Words
    for s in range(len(sentences)):
        sentence = sentences[s]

        if not len(sentence.words) == len(processed_sentences[s]):
            print("/!\\ Warning, sentence offset error in synset_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])

        for w in range(len(sentence.words)):
            word = sentence.words[w]
            rank = processed_sentences[s][w].get_senses()
            if len(rank) > 0:
                if not rank[0][0][0] == '8':
                    # ignore synsets offsets 8.......-.
                    # they are odd synsets that WordNet can't find...
                    word.synset = Synset(None, word.id_word, rank[0][0],
                                         wn.of2ss(rank[0][0]).name(), None,
                                         None, None)
                    if print_synsets:
                        # Moved inside the '8' guard: wn.of2ss() cannot
                        # resolve those odd offsets, so printing them
                        # would raise instead of reporting results.
                        print("Word : " + word.word)
                        print("Synset code : " + rank[0][0])
                        print("Synset name : " + wn.of2ss(rank[0][0]).name())

    # Add synsets to database

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()

    sentence_count = 0
    total_sentence = len(sentences)
    for sentence in sentences:
        # Print state
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count,
                      total_sentence)

        for word in sentence.words:
            synset = word.synset

            if synset is not None:
                # Add synset

                safe_execute(
                    c,
                    "INSERT INTO Synset (ID_Word, Synset_Code, Synset_Name) "
                    "VALUES (?, ?, ?)",
                    0,
                    _state_queue,
                    _id_process,
                    mark_args=(word.id_word, synset.synset_code,
                               synset.synset_name))

                # Get back id of last inserted review
                safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue,
                             _id_process)
                id_synset = c.fetchone()[0]

                # Update Word table (parameterized, consistent with the
                # INSERT above, instead of string concatenation).
                safe_execute(
                    c, "UPDATE Word SET ID_Synset = ? WHERE ID_Word = ?",
                    0, _state_queue, _id_process,
                    mark_args=(id_synset, word.id_word))

    safe_commit(conn, 0, _state_queue, _id_process)

    conn.close()
# Example 3
def add_sentences_from_reviews(reviews,
                               _state_queue=None,
                               _id_process=None,
                               freeling_modules=None):
    """

    Performs the first Freeling process applied to each normalized review.\n
    Each review is tokenized, and then splitted into sentences, thanks to corresponding Freeling modules.\n
    A representation of the Sentences and their Words (tokens) are then added to corresponding tables.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param reviews: Reviews to process
    :type reviews: :obj:`list` of |Review|
    :param _state_queue: Optional queue used to report progress to a parent process
    :param _id_process: Id of the current process, used together with _state_queue
    :param freeling_modules: Pre-initialized (morfo, tk, sp) Freeling modules;
        loaded here when None.
    :return: added sentences
    :rtype: :obj:`list` of |Sentence|
    """
    from loacore.classes.classes import Word
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...",
                             " - "))
        morfo, tk, sp = init_freeling()
    else:
        morfo, tk, sp = freeling_modules

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()

    added_sentences = []
    review_count = 0
    try:
        total_review = len(reviews)
    except TypeError:
        # Review is a ReviewIterator, unkown length.
        total_review = " - "

    # NOTE(review): `reviews` is iterated twice (tokenization below, commit
    # loop further down). If it is a one-shot iterator, the second pass sees
    # nothing — confirm ReviewIterator is re-iterable.
    for review in reviews:

        # Print state
        review_count += 1
        _tokenization_state(_state_queue, _id_process, review_count,
                            total_review)

        raw_review = review.review
        tokens = tk.tokenize(raw_review)
        sentences = sp.split(tokens)
        sentences = morfo.analyze(sentences)

        review_index = 0

        for sentence in sentences:

            # Overly long sentences (> 50 tokens) are skipped entirely.
            if len(sentence) <= 50:
                review_sentence = Sentence(None, review.id_review,
                                           review_index, None)

                review_index += 1

                # Add words
                sentence_index = 0
                for word in sentence:
                    review_sentence.words.append(
                        Word(None, None, sentence_index, word.get_form(), None,
                             None, None))
                    sentence_index += 1

                review.sentences.append(review_sentence)

    sentence_count = 0
    total_sentence = len([s for r in reviews for s in r.sentences])
    for r in reviews:
        for s in r.sentences:

            # Print state
            sentence_count += 1
            _commit_state(_state_queue, _id_process, sentence_count,
                          total_sentence)

            # Add sentence
            safe_execute(c, "INSERT INTO Sentence (ID_Review, Review_Index) "
                         "VALUES (?, ?)",
                         0,
                         _state_queue,
                         _id_process,
                         mark_args=(s.id_review, s.review_index))

            # Get back id of last inserted sentence
            safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue,
                         _id_process)
            id_sentence = c.fetchone()[0]
            s.id_sentence = id_sentence

            # Collect all word rows first, then run ONE executemany per
            # sentence. The previous code called executemany inside this
            # loop while sql_words kept growing, so every earlier word was
            # re-inserted on each iteration (duplicate Word rows).
            sql_words = []
            for w in s.words:
                w.id_sentence = id_sentence
                sql_words.append((id_sentence, w.sentence_index, w.word))
            safe_execute(
                c,
                "INSERT INTO Word (ID_Sentence, Sentence_Index, word) VALUES (?, ?, ?)",
                0,
                _state_queue,
                _id_process,
                mark_args=sql_words,
                execute_many=True)
            added_sentences.append(s)

    if _state_queue is None:
        print("")

    safe_commit(conn, 0, _state_queue, _id_process)

    conn.close()

    return added_sentences
# Example 4
def add_dep_tree_from_sentences(sentences,
                                print_result=False,
                                _state_queue=None,
                                _id_process=None,
                                freeling_modules=None):
    """
    Generates the dependency trees of the specified sentences and add the results to the
    database.\n
    Sentences are firstly converted into "raw" Freeling sentences (without any analysis) and then all the necessary
    Freeling processes are performed.\n
    The PoS_tag of words are also computed and added to the database in this function.\n

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    .. note:: This process can be quite long. (at least a few minutes)

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_result: Print PoS_tags and labels associated to each |Word|
    :type print_result: boolean
    :param _state_queue: Optional queue used to report progress to a parent process
    :param _id_process: Id of the current process, used together with _state_queue
    :param freeling_modules: Pre-initialized (morfo, tagger, sen, wsd, parser)
        Freeling modules; loaded here when None.
    """

    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...",
                             " - "))
        morfo, tagger, sen, wsd, parser = init_freeling()
    else:
        morfo, tagger, sen, wsd, parser = freeling_modules

    freeling_sentences = [
        sentence.compute_freeling_sentence() for sentence in sentences
    ]

    # Print state
    _parsing_state(_state_queue, "DT Tagging...", _id_process)

    # perform morphosyntactic analysis
    processed_sentences = morfo.analyze(freeling_sentences)
    processed_sentences = tagger.analyze(processed_sentences)

    # Print state
    _parsing_state(_state_queue, "DT Disambiguation...", _id_process)

    # annotate and disambiguate senses
    processed_sentences = sen.analyze(processed_sentences)
    processed_sentences = wsd.analyze(processed_sentences)

    # Print state
    _parsing_state(_state_queue, "Dep Tree Parsing...", _id_process)
    # Dependency tree parsing
    processed_sentences = parser.analyze(processed_sentences)

    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()

    sentence_count = 0
    total_sentence = len(sentences)
    for s in range(len(sentences)):
        # Print State
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count,
                      total_sentence)

        sentence = sentences[s]

        # Add dep_tree to database
        dt = processed_sentences[s].get_dep_tree()
        dep_tree = DepTree(None, None, sentence.id_sentence)

        safe_execute(c,
                     "INSERT INTO Dep_Tree (ID_Sentence) VALUES (?)",
                     0,
                     _state_queue,
                     _id_process,
                     mark_args=[dep_tree.id_sentence])

        # Get back id_dep_tree
        safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue,
                     _id_process)

        id_dep_tree = c.fetchone()[0]
        dep_tree.id_dep_tree = id_dep_tree

        # Database process
        root = None
        if not len(sentence.words) == len(processed_sentences[s]):
            print(
                "/!\\ Warning, sentence offset error in deptree_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])

        for w in range(len(sentence.words)):
            word = sentence.words[w]
            rank = processed_sentences[s][w].get_senses()
            if len(rank) > 0:
                word.PoS_tag = processed_sentences[s][w].get_tag()
                if print_result:
                    print("Word : " + word.word)
                    print("PoS_tag : " + processed_sentences[s][w].get_tag())
                    print("Label : " + dt.get_node_by_pos(w).get_label())

            # We use the get_node_by_pos function to map the tree to our sentence
            node = dt.get_node_by_pos(w)

            dep_tree_node = DepTreeNode(None, id_dep_tree, word.id_word,
                                        node.get_label(), 0)
            # dt.begin() is the tree root; remember the matching db node.
            if node == dt.begin():
                dep_tree_node.root = 1
                root = dep_tree_node

            # Add DepTreeNode to database
            safe_execute(
                c,
                "INSERT INTO Dep_Tree_Node (ID_Dep_Tree, ID_Word, Label, root) "
                "VALUES (?, ?, ?, ?)",
                0,
                _state_queue,
                _id_process,
                mark_args=(dep_tree_node.id_dep_tree, dep_tree_node.id_word,
                           dep_tree_node.label, dep_tree_node.root))

            # Get back id_dep_tree_node
            safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue,
                         _id_process)

            id_dep_tree_node = c.fetchone()[0]

            dep_tree_node.id_dep_tree_node = id_dep_tree_node

            # Use the freeling set_node_id function to store our db node id in the freeling node
            node.set_node_id(str(id_dep_tree_node))

            # Add PoS_tag to Word. Parameterized: the tag was previously
            # concatenated into the SQL string, which breaks on any quote
            # character in the tag.
            if word.PoS_tag is not None:
                safe_execute(
                    c, "UPDATE Word SET PoS_tag = ? WHERE ID_Word = ?",
                    0, _state_queue, _id_process,
                    mark_args=(word.PoS_tag, word.id_word))

        # Add dep_tree root to database (parameterized for consistency).
        dep_tree.root = root
        safe_execute(
            c, "UPDATE Dep_Tree SET ID_Dep_Tree_Node = ? "
            "WHERE ID_Dep_Tree = ?",
            0, _state_queue, _id_process,
            mark_args=(root.id_dep_tree_node, id_dep_tree))

        # Add children relations
        root_node = dt.begin()
        _rec_children(c, root_node, _state_queue, _id_process)

    if _state_queue is None:
        print("")

    safe_commit(conn, 0, _state_queue, _id_process)

    conn.close()
# Example 5
def add_lemmas_to_sentences(sentences,
                            print_lemmas=False,
                            _state_queue=None,
                            _id_process=None,
                            freeling_modules=None):
    """

    Performs a Freeling process to add lemmas to words.\n
    However, the argument is actually a sentence to better fit Freeling usage.\n
    Our sentences will be converted to a Freeling Sentences before processing.

    .. note:: This function should be used only inside the :func:`file_process.add_files()` function.

    :param sentences: Sentences to process
    :type sentences: :obj:`list` of |Sentence|
    :param print_lemmas: If True, print lemmatization results
    :type print_lemmas: boolean
    :param _state_queue: Optional queue used to report progress to a parent process
    :param _id_process: Id of the current process, used together with _state_queue
    :param freeling_modules: Pre-initialized morphological analyzer; loaded
        here when None.
    """
    from loacore.utils.db import safe_commit, safe_execute
    from loacore.conf import DB_TIMEOUT

    freeling_sentences = [
        sentence.compute_freeling_sentence() for sentence in sentences
    ]

    if freeling_modules is None:
        if _state_queue is not None:
            _state_queue.put(
                ProcessState(_id_process, os.getpid(), "Loading Freeling...",
                             " - "))
        morfo = init_freeling()
    else:
        morfo = freeling_modules

    # Print sentence
    _lemmatization_state(_state_queue, _id_process)

    processed_sentences = morfo.analyze(freeling_sentences)

    # Copy freeling results into our Words
    for s in range(len(sentences)):
        sentence = sentences[s]

        if not len(sentence.words) == len(processed_sentences[s]):
            print("/!\\ Warning, sentence offset error in lemma_process /!\\")
            print(sentence.sentence_str())
            print([w.get_form() for w in processed_sentences[s]])

        for w in range(len(sentence.words)):
            word = sentence.words[w]
            word.lemma = processed_sentences[s][w].get_lemma()
            if print_lemmas:
                print(word.word + " : " + word.lemma)

    # Add lemmas to database
    conn = sql.connect(DB_PATH, timeout=DB_TIMEOUT)
    c = conn.cursor()

    sentence_count = 0
    total_sentence = len(sentences)
    # Initial " - " state resets the progress display before the loop starts.
    _commit_state(_state_queue, _id_process, " - ", " - ")
    for sentence in sentences:
        # Print state
        sentence_count += 1
        _commit_state(_state_queue, _id_process, sentence_count,
                      total_sentence)

        for word in sentence.words:
            # Add Lemma to Lemma Table
            safe_execute(c,
                         "INSERT INTO Lemma (Lemma, ID_Word) VALUES (?, ?)",
                         0,
                         _state_queue,
                         _id_process,
                         mark_args=(word.lemma, word.id_word))

            # Get back id of last inserted lemma
            safe_execute(c, "SELECT last_insert_rowid()", 0, _state_queue,
                         _id_process)
            id_lemma = c.fetchone()[0]

            # Update Word table (parameterized, consistent with the INSERT
            # above, instead of string concatenation).
            safe_execute(
                c, "UPDATE Word SET ID_Lemma = ? WHERE ID_Word = ?",
                0, _state_queue, _id_process,
                mark_args=(id_lemma, word.id_word))

    if _state_queue is None:
        print("")
    safe_commit(conn, 0, _state_queue, _id_process)

    conn.close()