Example #1
0
 def __init__(self, reader, k=None):
     """Set up clustering state over *reader*'s documents.

     :param reader: source of texts to annotate (used later by annotate()).
     :param k: optional number of clusters; None lets the algorithm decide.
     """
     # Input source and annotation API client.
     self.reader = reader
     self.datatxt = DataTXT()
     self.k = k
     # Accumulators filled in by later pipeline steps.
     self.snippets, self.pages, self.topics = [], [], []
     self.rel_matrix = []
     self.topic_set = defaultdict(int)
     # Fixed seed so clustering runs are reproducible.
     seed((1000, 2000))
Example #2
0
 def __init__(self, items):
     """Split (text, url, user) triples into parallel lists.

     Positions stay aligned: texts[i], urls[i] and user[i] all come from
     the same input item.
     """
     self.texts = []
     self.urls = []
     self.user = []
     for item in items:
         self.texts.append(item[0])
         self.urls.append(item[1])
         self.user.append(item[2])
     self.datatxt = DataTXT()
Example #3
0
class Annotator(object):
    """Annotate texts (tweets) with Wikipedia entities via the DataTXT API.

    @usernames and URLs are masked with same-length underscore runs before
    annotation so the entity extractor ignores them (same length, presumably
    so character offsets in the response stay aligned — TODO confirm).
    """

    # http(s)/ftp URLs and bare www./ftp. host names.
    # Compiled once (was an inline non-raw string re-searched per token).
    _URL_RE = re.compile(
        r"((https?|ftp)://|(www|ftp)\.)[a-z0-9-]+(\.[a-z0-9-]+)+([/?].*)?")

    def __init__(self, items):
        # items: iterable of (text, url, user) triples, kept as parallel lists.
        self.texts = [x[0] for x in items]
        self.urls = [x[1] for x in items]
        self.user = [x[2] for x in items]
        self.datatxt = DataTXT()

    def _masked(self, token):
        """Return *token*, or a same-length underscore run for @users/URLs."""
        if token.startswith('@') or self._URL_RE.search(token):
            return "_" * len(token)
        return token

    def annotate(self, test=None):
        """Annotate every text; return (annotated_texts_by_id, page->tweets).

        :param test: when "annotations", return only the annotation page keys
                     of the first annotated text (doctest hook).

        >>> a = Annotator([('mozilla funziona google chrome', '', '')])
        >>> a.annotate(test="annotations")
        [u'http://en.wikipedia.org/wiki/Google_Chrome', u'http://en.wikipedia.org/wiki/Mozilla']
        >>> a = Annotator([('@mozilla funziona google chrome', '', '')])
        >>> a.annotate(test="annotations")
        [u'http://en.wikipedia.org/wiki/Google_Chrome']
        >>> a = Annotator([('mozilla funziona @google_chrome', '', '')])
        >>> a.annotate(test="annotations")
        [u'http://en.wikipedia.org/wiki/Mozilla']
        >>> a = Annotator([('@mozilla funziona @google_chrome', '', '')])
        >>> a.annotate(test="annotations")
        []
        >>> a = Annotator([('@google funziona http://google.com', '', '')])
        >>> a.annotate(test="annotations")
        []
        """
        tweets = defaultdict(list)
        annotated_texts_tmp = []

        for i, text in enumerate(self.texts):
            text_ann = u" ".join(self._masked(x) for x in text.split(' '))
            annotation = self.datatxt.nex(text_ann)

            if annotation is None:  # service returned nothing usable
                continue

            d = {
                'text': text,
                'url': self.urls[i],
                'user': self.user[i],
                'annotations': list(annotation['annotations'].values()),
            }

            # Re-key annotations by (English, where possible) Wikipedia page.
            nice_page = {}
            for topic, ann in annotation['annotations'].items():
                page = topic
                if '://it.' in topic:  # Italian entity: try the English page
                    en_page = self.datatxt.interWikiRecon.get_inter_wikilinks(
                        topic).get('EN')

                    if en_page is None:
                        # No English equivalent: the Italian page still
                        # indexes the tweet, but (as in the original code)
                        # no confidence entry is recorded for it.
                        page = topic
                    else:
                        page = en_page
                        nice_page[page] = ann['confidence']
                else:
                    nice_page[page] = ann['confidence']

                tweets[page].append(d)

            annotation['annotations'] = nice_page
            annotated_texts_tmp.append(annotation)

        # Index annotated texts by id (the list never contains None, so the
        # old `if x is not None` filter was dead and has been dropped).
        annotated_texts = {x['id']: x for x in annotated_texts_tmp}

        # The id is now the dict key; drop the redundant field from values.
        for v in annotated_texts.values():
            del v['id']

        if test == "annotations":
            first = list(annotated_texts.values())[0]
            return list(first['annotations'].keys())

        return annotated_texts, tweets
Example #4
0
class BaseClusterify(object):
    """Base scaffolding for clustering annotated tweets by topic relatedness.

    Subclasses implement do_cluster(); this class provides annotation,
    topic counting, the relatedness matrix, and output formatting.
    """

    def __init__(self, reader, k=None):
        """:param reader: source of texts; :param k: optional cluster count."""
        self.reader = reader
        self.datatxt = DataTXT()
        self.snippets = []
        self.pages = []
        self.topic_set = defaultdict(int)
        self.rel_matrix = []
        self.k = k
        self.topics = []
        # Fixed seed so clustering runs are reproducible.
        seed((1000, 2000))

    def annotate(self):
        """Annotate the reader's texts; keep snippets, return page->tweets."""
        annotator = Annotator(self.reader.texts())
        self.snippets, tweets = annotator.annotate()
        return tweets

    def _generate_topic_set(self):
        """Count, per topic page, how many snippets mention it."""
        logger.info("snippet set")
        logger.info(self.snippets)
        for snippet in self.snippets.values():
            # Only the page keys are needed (values were previously unpacked
            # and discarded).
            for page in snippet.get('annotations'):
                self.topic_set[page] += 1.

    def _generate_adjagent_matrix(self):
        """Build the symmetric topic-topic relatedness matrix.

        Queries the relatedness service in BATCH_SIZE x BATCH_SIZE blocks
        over the upper triangle and mirrors each value into both halves.
        (Method name kept for backward compatibility; "adjacent" intended.)
        """
        self._generate_topic_set()

        logger.info("topic set")
        logger.info(self.topic_set)

        # list() so slicing works on Python 3 key views as well.
        topics = list(self.topic_set)
        rel = zeros((len(topics), len(topics)))

        BATCH_SIZE = 10
        for offsetX in range(0, len(topics), BATCH_SIZE):
            topicsX = topics[offsetX: offsetX + BATCH_SIZE]
            for offsetY in range(offsetX, len(topics), BATCH_SIZE):
                topicsY = topics[offsetY: offsetY + BATCH_SIZE]
                rel_values = self.datatxt.rel(topicsX, topicsY)
                for i in range(len(topicsX)):
                    for j in range(len(topicsY)):
                        # Mirror into both halves: relatedness is symmetric.
                        rel[offsetX + i][offsetY + j] = rel_values[i][j]
                        rel[offsetY + j][offsetX + i] = rel_values[i][j]

        self.rel_matrix = rel

        return rel

    def _generate_cluster(self, ids):
        """Group topic pages by cluster id.

        :param ids: ids[i] is the cluster assigned to the i-th topic in
                    self.topic_set iteration order.
        :returns: dict mapping cluster id -> list of topic pages.
        """
        # Hoisted: the original rebuilt list(self.topic_set) on every
        # iteration, which was accidentally O(n^2).
        topics = list(self.topic_set)
        response = {}

        for i, cluster_id in enumerate(ids):
            response.setdefault(cluster_id, []).append(topics[i])

        return response

    def _generate_output_response(self, response):
        """Format clusters as {'clusters': [{topic: sqrt(frequency)}, ...]}.

        Topics are weighted by the square root of their mention count.
        """
        if not self.topic_set:
            self._generate_topic_set()

        clusters = []
        for cluster in response.values():
            clusters.append({topic: self.topic_set[topic] ** .5
                             for topic in cluster})

        return {
            'clusters': clusters
        }

    def do_cluster(self):
        """Run the clustering; subclasses must override.

        The original `raise NotImplemented` raised a TypeError (the
        NotImplemented constant is not an exception).
        """
        raise NotImplementedError