示例#1
0
class TestReader(unittest.TestCase):
    def setUp(self):
        self.reader = Reader()
        self.tokenized_text = word_tokenize(TEXT)
        self.classified_text = self.reader.st.tag(self.tokenized_text)

    def test_init(self):
        assert TEST1 == self.classified_text

    def test_read_files(self):
        self.lst_news = self.reader.read_files("data/bbc")
        self.assertFalse(len(self.reader.file_names) == 0)
        self.assertTrue(os.access(self.reader.file_names[0], os.R_OK))

    def test_parse_news(self):
        self.lst_news = self.reader.read_files("data/bbc")
        # test on a subset of news articles, e.g. 10 files
        res = self.reader.parse_news(self.lst_news[:10])

    def test_filter_stop_words(self):
        example = [
            'This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off',
            'the', 'stop', 'words', 'filtration', '.'
        ]
        res = self.reader.filter_stop_words(example)
        print(res)
        assert res == [
            'sample', 'sentence', 'showing', 'stop', 'words', 'filtration'
        ]

    def test_stem_words(self):
        example = ['game', 'gaming', 'gamed', 'games']
        res = self.reader.stem_words(example)
        assert res == ['game']
示例#2
0
class Graph(object):
    """
    Base class which represents the heterogeneous textual graph (undirected graph). G = <V, E>. V is the set of nodes (objects), including 3 types of objects (i.e. new entites, known entities, and contextual words). Entities are words (with label: PERSON, LOCATION, and ORGANIZATION) whereas contextual words are the remaining uni-gram words. New entities are the entities not in DBpedia, and Know entities are the entities in the DBpedia. E is a set of edges (co-occurrences) of entity-entity, entity-word, and word-word corrences. Words within every 5-word sliding window in a news sentence are considered to be co-occuring with each other. The weights are represented by adjacency matrix using dataframe.

    Attributes:
        nodes: dictionary of nodes {"N (new entity)": [(word, label)], "K (Known entity)": [(word, label)], "C (Contextual word)": [(word, label)]} in the graph; Includes 3 types of objects (i.e. e new entites, known entities, and contextual words).
        edges: set contains the tuples. e.g. ("A", "B") indicates a link between node "A" and node "B".
        weights: The weights are represented by adjacency matrix using dataframe.
        news: list of news articles (articles are string type).
    """
    def __init__(self, lst_news, window_size=5):
        """Inits Graph
        Args:
            lst_news: list of string. list of news articles.
        """
        self.window_size = window_size
        self.news = list(lst_news)
        self.reader = Reader()
        self.search = Search()

        self.nodes = self.__create_nodes()
        self.edges = self.__create_edges()
        self.edge_weights = self.__create_weights()

    def __create_nodes(self):
        """Private class method
        Takes in a list of news articles (articles are string types):
        1) tokenize the articles
        2) remove stopwords
        3) label words with 3 labels (i.e. PERSON, ORGANIZATION, LOCATION)
        4) Match entities (i.e. person, org, loc) against DBpedia

        Returns:
            Returns a dictionary contains 3 types of objects (i.e. new entites, known entities, and contextual words). E.g. {"N": [("Washington", "LOCATION")], "K":[("Trump", "PERSON"), ("Hua Wei", "ORGANIZATION")], "C": [("the", "O"), ("am", "O")]}
        """

        # parse news articles
        tagged_words = self.reader.parse_news(self.news)
        # seperate entities from contextual words
        entities, cwords = self.__entities_words(tagged_words)
        new_e, known_e = self.search.query(entities)

        ret = dict()
        ret["N"] = list(set(new_e))
        ret["K"] = list(set(known_e))
        ret["C"] = list(set(cwords))

        return dict(ret)

    def get_nodes(self):
        """
        Getter method which returns all nodes from self.nodes.
        """
        ret = set()
        for i in self.nodes["N"]:
            ret.add(i[0])
        for i in self.nodes["K"]:
            ret.add(i[0])
        for i in self.nodes["C"]:
            ret.add(i[0])
        return list(ret)

    def get_entities(self):
        """
        Getter method which returns a list of entities (i.e. word tagged with "PERSON", "LOCATION", "ORGANIZATION") from self.nodes.
        """
        ret = set()
        for i in self.nodes["N"]:
            ret.add(i[0])
        for i in self.nodes["K"]:
            ret.add(i[0])
        return list(ret)

    def get_words(self):
        """
        Getter method which returns a list of contextual words from self.nodes.
        """
        ret = set()
        for i in self.nodes["C"]:
            ret.add(i[0])
        return list(ret)

    def __create_edges(self, window_size=5):
        """Private class method
        Takes in a list of news articles, and extract the co-occurring links between nodes. Nodes within 5-word sliding window in a news sentence are considered to be co-occuring with each other. The frequncies of nodes co-appearing in news sentences as weights of these links.  

        Returns:
            Returns a set of links between nodes. 
        """
        e = set()
        for article in self.news:
            self.tokenized_text = word_tokenize(article)
            self.tokenized_text = self.reader.filter_stop_words(
                self.tokenized_text)
            generator = self.sliding_window(self.tokenized_text,
                                            self.window_size)
            for t in generator:
                e = e.union(set(itertools.combinations(t, 2)))
        return set(e)

    def get_edges(self):
        """
        Getter method which returns a set of edges from self.edges.
        """
        return set(self.edges)

    def sliding_window(self, seq, n=5):
        """
        Returns a sliding window (of width n) over data from the iterable
           s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...

        Args:
            seq: list of words; This is one news article splitted into a list of words.
            n: int; size of the sliding window

        Returns:
            An iterator contains all the sliced window. See the test case in `tests/test_graph.py` for more details.
        """
        it = iter(seq)
        result = tuple(islice(it, n))
        if len(result) <= n:
            yield result
        for elem in it:
            result = result[1:] + (elem, )
            yield result

    def __create_weights(self):
        """Private class method
        Create weights matrix using pandas dataframe. The value at ith row and jth row is the counts of links (undirected) between node i and node j.
        
        Returns:
            Return a copy of dataframe representing the weights matrix.
        """
        words = self.get_nodes()
        df = pd.DataFrame(index=words, columns=words).fillna(0)

        for article in self.news:
            self.tokenized_text = word_tokenize(article)
            self.tokenized_text = self.reader.filter_stop_words(
                self.tokenized_text)
            generator = self.sliding_window(self.tokenized_text,
                                            self.window_size)

            for t in generator:
                for tup in set(itertools.combinations(t, 2)):
                    if tup[0] != tup[1]:
                        df.loc[tup[0], tup[1]] += 1
                        df.loc[tup[1], tup[0]] += 1

        return df.copy()

    def get_weights(self):
        return self.edge_weights.copy()

    def __entities_words(self, tagged_words):
        """Private class method
        Seperate the entity words from the comtextual words.

        Args:
            tagged_words: list of strings; a list of tuples (word, label)

        Returns:
            entities: words tagged with "PERSON", "LOCATION", "ORGANIZATION".
            cwords: words tagged with "O" 
        """
        entities = list()
        cwords = list()
        for word in tagged_words:
            if word[1] == "O":
                # contextual words
                cwords.append(word)
            else:
                entities.append(word)
        assert len(entities) + len(cwords) == len(tagged_words)
        return entities, cwords

    def update_weight(self, e, w):
        """
        Update the edge weight in the enternal weight matrix. 
        Args:
            e: tuple; a tuple contains two nodes, e.g. ("A", "B")
            w: int; The new weight associated with e
        """
        if e[0] not in set(self.get_nodes()):
            raise ValueError("Node {} is not in the graph".format(str(e[0])))
        if e[1] not in set(self.get_nodes()):
            raise ValueError("Node {} is not in the graph".format(str(e[1])))

        if e in self.edges and w <= 0:
            self.edge_weights.loc[e[0], e[1]] = w
            self.edge_weights.loc[e[1], e[0]] = w
            self.edges.remove(e)
            self.edges.remove((e[1], e[0]))
        elif e in self.edges and w > 0:
            self.edge_weights.loc[e[0], e[1]] = w
            self.edge_weights.loc[e[1], e[0]] = w
        else:
            self.edge_weights.loc[e[0], e[1]] = w
            self.edge_weights.loc[e[1], e[0]] = w
            self.edges.add(e)