Exemplo n.º 1
0
    def format(self, raw_dataset_path, target_dataset_dir, is_save=True):
        """Convert a token-per-line, tab-separated annotated corpus into documents and mentions.

        The raw file is read line by line:

        * a line starting with ``-DOCSTART-`` closes the current document and
          starts a new one;
        * a blank line becomes a newline in the document text;
        * a line consisting of punctuation is glued to the preceding text;
        * any other line is split on tabs. Lines with more than one column are
          read as ``token, flag, mention_label, yago_id`` (plus a URL in the
          fifth column when ``yago_id`` is not ``--NME--``); a ``B`` flag opens
          a mention whose offsets are character positions in the document text.

        Args:
            raw_dataset_path: Path of the raw annotated file (read as UTF-8).
            target_dataset_dir: Directory that receives the JSON outputs.
            is_save: When true, write ``annotations.json``, ``docs.json``,
                ``xlore_misses.json`` and ``valid_entities.json``.

        Returns:
            Tuple ``(total_mentions, NIL_mentions, xlore_misses, nme_mentions)``.
        """
        total_mentions, nme_mentions, NIL_mentions = [], [], []

        mentions = []  # type: List[List[tuple]]
        docs, xlore_misses, valid_entities = [], [], []
        with open(raw_dataset_path, "r", encoding="utf-8") as rf:
            doc_mentions = []  # type: List[tuple]
            doc = ""
            for line in rf:
                stripped = line.strip()
                if line.startswith("-DOCSTART-"):
                    # Flush the document collected so far (if any) and reset.
                    doc = doc.strip()
                    if len(doc) > 0:
                        docs.append(doc)
                        mentions.append(doc_mentions)
                    doc, doc_mentions = "", []
                elif len(stripped) == 0:
                    # Blank line: replace the pending separator space with a newline.
                    doc = doc.strip(' ')
                    doc += "\n"
                elif stripped in string.punctuation:
                    # Punctuation: attach directly to the previous token.
                    doc = doc.strip(' ')
                    doc += stripped
                else:
                    line_arr = stripped.split("\t")
                    if len(line_arr) > 1:
                        token, flag, mention_label, yago_id = line_arr[0], line_arr[1], line_arr[2], line_arr[3]
                        if flag == 'B':
                            total_mentions.append(mention_label)
                            mention = Mention(len(doc), len(doc) + len(mention_label), mention_label)
                            if yago_id != '--NME--':
                                # Drop the first 23 characters of the URL column
                                # (presumably the scheme/host prefix - TODO confirm).
                                wiki_url = line_arr[4][23:]
                                entity = self.entity_manager.entity_dictionary.get_entity_from_uri(wiki_url)  # type: Entity
                                if entity is not None:
                                    valid_entities.append(wiki_url)
                                    mention.set_gold_entity(entity)
                                    doc_mentions.append((mention.start, mention.end, mention.label, mention.gold_entity.ID))
                                else:
                                    # The URL is annotated but unknown to the entity dictionary.
                                    NIL_mentions.append(mention_label)
                                    doc_mentions.append((mention.start, mention.end, mention.label, 'NIL'))
                                    xlore_misses.append(wiki_url)
                            else:
                                NIL_mentions.append(mention_label)
                                nme_mentions.append(mention.label)
                                doc_mentions.append((mention.start, mention.end, mention.label, "NIL"))
                        # The full mention label is emitted on its 'B' line, so
                        # continuation ('I') lines add no text.
                        if flag != 'I':
                            doc += mention_label + ' '
                    else:
                        doc += line_arr[0] + " "
            # Flush the trailing document with the same rule used at -DOCSTART-,
            # so a final document is kept even when it has no mentions.
            doc = doc.strip()
            if len(doc) > 0:
                docs.append(doc)
                mentions.append(doc_mentions)

        if is_save:
            outputs = (
                ("annotations.json", mentions),
                ("docs.json", docs),
                ("xlore_misses.json", xlore_misses),
                ("valid_entities.json", valid_entities),
            )
            for file_name, payload in outputs:
                # Context manager guarantees each file is flushed and closed.
                with open(os.path.join(target_dataset_dir, file_name), "w", encoding="utf-8") as wf:
                    json.dump(payload, wf, indent=4, ensure_ascii=False)

        self.report_result(docs, total_mentions, NIL_mentions, xlore_misses, nme_mentions)

        return total_mentions, NIL_mentions, xlore_misses, nme_mentions
Exemplo n.º 2
0
def process_mentions(message):
    """Create and save a Mention for every ``@...`` token in ``message.message``.

    Each regex match (including its leading ``@``) is stored as the Mention's
    ``name`` and linked to ``message``.
    """
    # Raw string: the pattern contains backslash escapes (``\(``, ``\)``) that
    # are invalid escape sequences in a normal string literal.
    mention_pattern = r'@(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    for mention in re.findall(mention_pattern, message.message):
        new_mention = Mention(name=mention, message=message)
        new_mention.save()
Exemplo n.º 3
0
def test_get_kweek_mentions():
    """Check get_kweek_mentions on a kweek with a mention and on one without."""
    # First kweek: its first mention must match the expected Mention.
    rows = db_manager.execute_query(
        " SELECT ID FROM KWEEK WHERE USERNAME = '******' LIMIT 1")
    first_found = actions.get_kweek_mentions(rows[0]['id'])[0]
    wanted = Mention({'username': '******', 'indices': [25, 35]})
    assert isinstance(first_found, Mention)
    assert first_found.to_json() == wanted.to_json()

    # Second kweek: no mentions are expected at all.
    rows = db_manager.execute_query(
        "SELECT ID FROM KWEEK WHERE USERNAME = '******' LIMIT 1")
    assert actions.get_kweek_mentions(rows[0]['id']) == []
Exemplo n.º 4
0
def get_twitter_mentions():
    """Search Twitter for every entry in QUERIES and store unseen tweets as Mentions.

    A status is stored only when no Mention with the same ``domain_id``
    (the tweet's ``id_str``) exists yet.

    Returns:
        The number of Mention rows added in this call.
    """
    statuses = []
    for query in QUERIES:
        # Search for the current query; previously a hard-coded term was used
        # on every iteration, so the loop variable had no effect.
        response = client.api.search.tweets.get(q=query, count=100)
        statuses.extend(response.data.statuses)

    session = Session()
    twitter = session.query(Source).get(1)
    new_mentions = 0
    for status in statuses:
        already_stored = session.query(Mention).filter(
            Mention.domain_id == status.id_str).count()
        if already_stored:
            continue
        created_at = datetime.datetime.strptime(
            status.created_at, r"%a %b %d %H:%M:%S +0000 %Y")
        m = Mention(text=status.text,
                    associated_user='******'.format(
                        status.user.screen_name,
                        status.user.followers_count),
                    recorded_at=datetime.datetime.now(),
                    occurred_at=created_at,
                    source=twitter,
                    domain_id=status.id_str)
        new_mentions += 1
        session.add(m)
    session.commit()
    return new_mentions
Exemplo n.º 5
0
def handle_mention(request_params, mention):
    """Build a Mention from a raw mention dict, run an action on it and respond.

    Args:
        request_params: Passed through to ``mention_response``.
        mention: Mapping with the keys ``action_text``, ``message_id``,
            ``mentioned_by`` and ``full_message``.

    Returns:
        Tuple ``(mention, action, response)`` where ``mention`` is the
        constructed Mention object.

    Raises:
        RuntimeError: If ``get_actions()`` returns ``None``. Previously this
            path failed with an UnboundLocalError on ``action``.
    """
    mention = Mention(mention['action_text'], mention['message_id'],
                      mention['mentioned_by'], mention['full_message'])
    actions = get_actions()
    if actions is None:
        raise RuntimeError(
            "get_actions() returned None; cannot perform an action for the mention")
    action = perform_action(actions, mention)
    response = mention_response(request_params, action.action_result,
                                mention.message_id)
    return (mention, action, response)
Exemplo n.º 6
0
def create_entity_mentions(mentions):
    """Wrap each raw entity mention in a Mention object, preserving input order.

    Every input item supplies ``text.content``, ``text.begin_offset``,
    ``type``, ``sentiment.score`` and ``sentiment.magnitude``.
    """
    return [
        Mention(
            content=raw.text.content,
            begin_offset=raw.text.begin_offset,
            type=raw.type,
            sentiment_score=raw.sentiment.score,
            sentiment_magnitude=raw.sentiment.magnitude,
        )
        for raw in mentions
    ]
Exemplo n.º 7
0
def create_tweet():
    """Store a tweet from the submitted form and record the users it mentions.

    The tweet is committed first so that ``tweet.id`` is available. Every
    ``@name`` in the content that resolves to an existing user, passes
    ``Mention.validate`` and is not the current user gets a Mention row.

    Returns:
        A redirect to the ``timeline`` endpoint.
    """
    content = request.form['content']

    tweet = Tweet(content, g.user.id)
    db.session.add(tweet)
    db.session.commit()

    # Raw string: ``\w`` is an invalid escape in a normal string literal.
    pattern = re.compile(r'@(\w+)')
    for name in pattern.findall(content):
        user = User.query.filter_by(username=name).first()
        if user is None:
            # "@name" does not match any account; there is nothing to mention.
            continue
        if Mention.validate(user.id, tweet.id) and user is not g.user:
            mention = Mention(user.id, tweet.id)
            db.session.add(mention)
    db.session.commit()

    return redirect(url_for('timeline'))
Exemplo n.º 8
0
def prioritize(words):
    """Count how often each quote is mentioned by the given words.

    For every word with a DictionaryWord match, the Mentions of the first
    match are fetched and their ``quote`` values are tallied.

    Args:
        words: Iterable of words to look up.

    Returns:
        List of ``(quote, count)`` tuples sorted by ascending count.
    """
    counters = {}
    for word in words:
        dict_match = DictionaryWord.query(
            DictionaryWord.word == word).fetch()
        if not dict_match:
            continue
        mentions = Mention.query(Mention.word == dict_match[0].key).fetch()
        for mention in mentions:
            quote = mention.quote
            counters[quote] = counters.get(quote, 0) + 1
    # sorted() already returns a new list, so no extra copy is needed.
    ordered = sorted(counters.items(), key=lambda item: item[1])
    logging.debug(" ====================================== ")
    logging.debug(ordered)
    logging.debug(" ====================================== ")
    return ordered
Exemplo n.º 9
0
 def create_mentions(self, asin, comment_attrs):
     """Link the product ``asin`` to a comment through a Mention row, once.

     Resolves (or creates) the subsite, comment and product, writes the
     subsite id into ``comment_attrs['subsite_id']``, and adds a Mention only
     when both product and comment are truthy and no Mention for that pair
     exists yet.
     """
     logging.info(comment_attrs)
     logging.info(asin)

     subsite = self.find_or_create_subsite(comment_attrs['subsite_name'])
     logging.info("Subsite: " + str(subsite))
     comment_attrs['subsite_id'] = subsite.id

     comment = self.find_or_create_comment(comment_attrs)
     product = self.find_or_create_product(asin)
     logging.info(comment)
     logging.info(product)

     # Nothing to link unless both sides were resolved.
     if not (product and comment):
         return

     existing = session.query(Mention).filter_by(
         product_id=product.id, comment_id=comment.id).first()
     if existing:
         return

     session.add(Mention(product_id=product.id, comment_id=comment.id))
     session.commit()
Exemplo n.º 10
0
def parser(word):
    """Return the quotes that mention ``word`` as a list of dicts.

    Each dict has the keys ``line``, ``context`` and ``movie``. An empty
    list is returned when the word has no DictionaryWord match.
    """
    matches = DictionaryWord.query(
        DictionaryWord.word == word).fetch()
    if not matches:
        return []

    quote_dicts = []
    # Only the first dictionary match is used to look up mentions.
    for found in Mention.query(Mention.word == matches[0].key).fetch():
        quote = found.quote.get()
        quote_dicts.append(
            dict(line=quote.line, context=quote.context, movie=quote.movie))
    return quote_dicts
Exemplo n.º 11
0
def get_kweek_mentions(kweek_id):
    """
        Builds Mention objects for every mention row of a kweek.


        *Parameters:*
            - *kweek_id (int)*: The id of the kweek.

        *Returns:*
            - *List of models.Mention objects*, one per row returned by
              query_factory.get_kweek_mentions, in the same order.
    """
    return [
        Mention({
            'username': row['username'],
            'indices': [row['starting_index'], row['ending_index']],
        })
        for row in query_factory.get_kweek_mentions(kweek_id)
    ]
Exemplo n.º 12
0
    def parse_text(self, text: str) -> List[Mention]:
        """Parse ``text`` with the underlying parser and return its mentions.

        The raw parser output goes through ``format_output`` and
        ``solve_conflict``. Each resulting item is read as
        ``(start, end, label, candidate_ids)`` and turned into a Mention with
        one Candidate per candidate id.

        Args:
            text: The text to parse.

        Returns:
            The list of Mention objects, in the order of the parsed items.
        """
        # The parser runs in a JVM; make sure the calling thread is attached.
        if not isThreadAttachedToJVM():
            attachThreadToJVM()

        parsed_result = self.solve_conflict(
            self.format_output(self.parser.parseText(text), text))

        mention_list = []  # type: List[Mention]
        for item in parsed_result:
            mention = Mention(int(item[0]), int(item[1]), item[2])
            mention.candidates = []
            # Tag every mention with its parser, including mentions without
            # candidates (this used to be set only inside the candidate loop).
            mention.parse_from = self.param_config.name
            for cand_id in item[3]:
                mention.add_candidate(Candidate(cand_id))
            mention_list.append(mention)
        return mention_list
Exemplo n.º 13
0
    query: str = """SELECT COUNT(*) FROM HASHTAG """
    third_count = db_manager.execute_query(query)[0]['count']
    assert third_count - second_count == 0
    check, message = actions.insert_kweek(kweek_test_3)
    assert message == 'Repeated mention in the same kweek'
    check, message = actions.insert_kweek(kweek_test_4)
    assert message == 'the user mentioned does not exist in the database'




@pytest.mark.parametrize("text, expected_hashtags, expected_mentions",
                         [
                             ('#hashtag and @mention',
                              [Hashtag({'indices': (0, 8), 'text': '#hashtag', 'id': 0})],
                              [Mention({'indices': (13, 21), 'username': '******'})]),
                             ('#hashtag and @mention ',
                              [Hashtag({'indices': (0, 8), 'text': '#hashtag', 'id': 0})],
                              [Mention({'indices': (13, 21), 'username': '******'})]),
                             ('@mention and #hashtag',
                              [Hashtag({'indices': (13, 21), 'text': '#hashtag', 'id': 0})],
                              [Mention({'indices': (0, 8), 'username': '******'})]),
                             ('@mention and #hashtag ',
                              [Hashtag({'indices': (13, 21), 'text': '#hashtag', 'id': 0})],
                              [Mention({'indices': (0, 8), 'username': '******'})]),
                             ('@mention and # ',
                              [Hashtag({'indices': (13, 14), 'text': '#', 'id': 0})],
                              [Mention({'indices': (0, 8), 'username': '******'})]),
                             ('@mention and #',
                              [],
                              [Mention({'indices': (0, 8), 'username': '******'})]),
Exemplo n.º 14
0
def test_insert_kweek():
    """Integration test for actions.insert_kweek against the database.

    Builds four Kweek fixtures, clears any existing '#sky' hashtag rows,
    inserts the first fixture and compares the stored kweek, mention and
    hashtag rows with the expected values, then checks the hashtag count and
    the messages returned for the remaining fixtures.
    """
    # Fixture 1: inserted first; its stored rows are compared below.
    kweek_test_1 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test1',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]},
            )
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # Fixture 2: expected to insert with message 'success' without adding a
    # new HASHTAG row (it reuses '#sky').
    kweek_test_2 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test2',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]})
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # Fixture 3: expected to be rejected with 'Repeated mention in the same kweek'.
    kweek_test_3 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test3',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]},
            )
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # Fixture 4: expected to be rejected with
    # 'the user mentioned does not exist in the database'.
    kweek_test_4 = Kweek({
        'id': 0,
        'created_at': datetime.utcnow(),
        'text': '#test1',
        'media_url': None,
        'user': User({
            'username': '******',
            'screen_name': 'test1',
            'profile_image_url': 'image_url',
            'following': False,
            'follows_you': False,
            'muted': False,
            'blocked': False
        }),
        'mentions': [
            Mention({
                'username': '******',
                'indices': [10, 16]}),
            Mention({
                'username': '******',
                'indices': [18, 20]},
            )
        ],
        'hashtags': [
            Hashtag({
                'text': '#sky',
                'indices': [10, 16],
                'id': 0
            })
        ],
        'number_of_likes': 0,
        'number_of_rekweeks': 0,
        'number_of_replies': 0,
        'reply_to': None,
        'rekweek_info': None,
        'liked_by_user': False,
        'rekweeked_by_user': False
    })
    # Remove any existing '#sky' hashtag (and its kweek links) so that the
    # first insert below creates exactly one new HASHTAG row.
    query: str = """SELECT ID FROM HASHTAG WHERE TEXT=%s  """
    data = ('#sky',)
    hid = db_manager.execute_query(query, data)
    if len(hid) != 0:
        query: str = """DELETE FROM HASHTAG WHERE ID=%s  """
        data = (hid[0]['id'],)
        db_manager.execute_query_no_return(query, data)
        query: str = """DELETE FROM KWEEK_HASHTAG WHERE HASHTAG_ID=%s  """
        data = (hid[0]['id'],)
        db_manager.execute_query_no_return(query, data)
    # Baseline hashtag count before inserting anything.
    query: str = """SELECT COUNT(*) FROM HASHTAG """
    first_count = db_manager.execute_query(query)[0]['count']
    actions.insert_kweek(kweek_test_1)
    # Read back the most recently created kweek and hashtag ids.
    query: str = """SELECT ID FROM KWEEK ORDER BY ID DESC LIMIT 1 """
    kid = db_manager.execute_query(query)[0]['id']
    print("kweek id", kid)
    query: str = """SELECT ID FROM HASHTAG ORDER BY ID DESC LIMIT 1 """
    hid = db_manager.execute_query(query)[0]['id']
    print("hahstag id ", hid)
    # Fetch the stored kweek row.
    query: str = """SELECT ID,TEXT,media_url,username,reply_to FROM KWEEK WHERE ID= %s """
    data = (kid,)
    resulted_kweek = db_manager.execute_query(query, data)[0]
    print("kweek", resulted_kweek)
    # Fetch the stored mention; only the first returned row is compared.
    query: str = """SELECT * FROM MENTION WHERE  KWEEK_ID= %s"""
    data = (kid,)
    resulted_mention = db_manager.execute_query(query, data)[0]
    # Fetch the stored hashtag joined with its kweek link.
    query: str = """SELECT TEXT, KWEEK_ID, HASHTAG_ID, STARTING_INDEX, ENDING_INDEX 
     FROM KWEEK_HASHTAG JOIN HASHTAG  ON ID = HASHTAG_ID WHERE KWEEK_ID  = %s"""
    data = (kid,)
    resulted_hashtag = db_manager.execute_query(query, data)[0]
    print("hashtag", resulted_hashtag)
    expected_mention = {'kweek_id': kid, 'username': '******', 'starting_index': 10,
                        'ending_index': 16}
    expected_hahstag = {'text': '#sky', 'kweek_id': kid, 'hashtag_id': hid,
                        'starting_index': 10, 'ending_index': 16}
    # NOTE(review): kweek_test_1 is built with text '#test1' but '#testtest'
    # is expected here - confirm which value is intended.
    expected_kweek = {'id': kid, 'text': '#testtest',
                      'media_url': None, 'username': '******', 'reply_to': None}
    assert expected_kweek == resulted_kweek
    assert expected_hahstag == resulted_hashtag
    assert expected_mention == resulted_mention
    # The first insert must have created exactly one new hashtag.
    query: str = """SELECT COUNT(*) FROM HASHTAG """
    second_count = db_manager.execute_query(query)[0]['count']
    assert (second_count - first_count) == 1
    check, message = actions.insert_kweek(kweek_test_2)
    assert message == 'success'
    # Inserting a second kweek with the same hashtag must not add a hashtag row.
    query: str = """SELECT COUNT(*) FROM HASHTAG """
    third_count = db_manager.execute_query(query)[0]['count']
    assert third_count - second_count == 0
    # Error paths: repeated mention, then a mention of an unknown user.
    check, message = actions.insert_kweek(kweek_test_3)
    assert message == 'Repeated mention in the same kweek'
    check, message = actions.insert_kweek(kweek_test_4)
    assert message == 'the user mentioned does not exist in the database'
Exemplo n.º 15
0
    def predict(self, document) -> List[Mention]:
        """Detect mentions in ``document`` and rank their candidate entities.

        Steps:
            1. Parse the document into ``(start, end, mention_str, candidates)``
               items and keep, per mention, the candidates whose
               context-words similarity exceeds ``self.context_words_sim_th``.
            2. Pick as seed the best candidate of each mention when its
               context-words similarity exceeds ``self.seed_candidates_sim_th``.
            3. Score every candidate against the seed candidates
               (context-entities similarity).
            4. Combine both similarities into a believe score weighted by
               ``self.words_sim_weight``, sort the candidates by it and keep
               the mentions whose best score exceeds ``self.believe_score_th``.

        Args:
            document: The text to analyse; context windows are measured in
                characters of this string.

        Returns:
            The mentions that pass the believe-score threshold.
        """
        # NOTE(review): assumes parse_text yields 4-tuples
        # (start, end, mention_str, candidates) - confirm against the parser.
        mention_list = self.mention_parser.parse_text(document)

        mentions = []
        for start, end, mention_str, candidates in mention_list:
            # Clamp the character window around the mention to the document.
            prev_start = max(start - self.context_words_window, 0)
            after_end = min(end + self.context_words_window, len(document))
            prev_context_words = [
                word for word in self.word_parser.parse_text(
                    document[prev_start:start])
                if word in self.word_manager.vec_model.vectors
            ]
            after_context_words = [
                word for word in self.word_parser.parse_text(
                    document[end:after_end])
                if word in self.word_manager.vec_model.vectors
            ]
            # Build a new list: extending prev_context_words in place would
            # leak the "after" words into the mention's previous context.
            context_words = prev_context_words + after_context_words

            # Preliminary filter: keep candidates by context_words_sim.
            valid_candidates = []  # type: List[Candidate]
            for candidate_id in candidates:
                if not self.entity_manager.is_entity_has_embed(candidate_id):
                    continue
                entity = self.entity_manager.entity_dictionary.entity_dict.get(
                    candidate_id)
                if entity is None:
                    continue
                candidate = Candidate(candidate_id)
                candidate.set_entity(entity)
                candidate.set_context_words_sim(
                    self.cal_candidate_context_words_sim(
                        candidate_id, context_words))
                if candidate.context_words_sim > self.context_words_sim_th:
                    valid_candidates.append(candidate)

            if len(valid_candidates) > 0:
                mention = Mention(start, end, mention_str, valid_candidates)
                mention.set_prev_context(prev_context_words)
                mention.set_after_context(after_context_words)
                mentions.append(mention)

        # Start computing the context-entities similarity.
        seed_candidates = []  # type: List[Candidate]

        # Select seed candidates: the best candidate of each mention, if it
        # clears the seed threshold. Every mention has at least one candidate.
        for mention in mentions:
            max_cand = max(
                mention.candidates,
                key=lambda cand: cand.context_words_sim)
            if max_cand.context_words_sim > self.seed_candidates_sim_th:
                seed_candidates.append(max_cand)
                mention.set_result_cand(max_cand)

        # Context entities shared by all mentions: the seeds' entities.
        context_entities = [cand.entity for cand in seed_candidates]

        # Compute context_entities_sim for every candidate of every mention.
        for mention in mentions:
            if mention.result_cand is None:
                # Not yet disambiguated: compare against all seed candidates.
                seeds_for_mention = seed_candidates
            else:
                # Already disambiguated: exclude seeds that are this
                # mention's own candidates before comparing.
                own_entity_ids = [cand.entity_id for cand in mention.candidates]
                seeds_for_mention = [
                    seed_cand for seed_cand in seed_candidates
                    if seed_cand.entity_id not in own_entity_ids
                ]

            mention.set_context_entities(context_entities)
            for candidate in mention.candidates:
                candidate.set_context_entities_sim(
                    self.cal_candidate_context_entities_sim(
                        candidate.entity_id, seeds_for_mention))

        # Set each candidate's believe_score and pick the best per mention.
        for mention in mentions:
            for cand in mention.candidates:
                cand.set_believe_score(
                    self.words_sim_weight * cand.context_words_sim +
                    (1 - self.words_sim_weight) * cand.context_entities_sim)
            mention.candidates = sorted(
                mention.candidates,
                key=lambda item: item.believe_score,
                reverse=True)
            mention.set_result_cand(mention.candidates[0])

        # Filter the mentions once more by believe_score.
        refined_mentions = [
            m for m in mentions
            if m.result_cand.believe_score > self.believe_score_th
        ]

        # TODO: expand seed candidates here

        return refined_mentions
Exemplo n.º 16
0
    def build_sample(self,
                     mention_list: List,
                     document: str,
                     context_window=-1,
                     context_words_sim_th=-1,
                     seed_candidats_sim_th=-1,
                     believe_score_th=-1):
        """Build scored Mention objects from parsed mentions and their document.

            1. Build a List[Mention] from the mention list and the document
               (mainly prev_context, after_context and context_words_sim).
            2. Compute context_entities_sim from that List[Mention].

        Any threshold argument different from -1 overwrites the matching
        attribute on ``self`` before processing.

        Args:
            mention_list: [(start, end, mention, candidates)], the result should come from MentionParser.parse_text(document)
            document: the input document.
            context_window: the window size is the character number, not word number.
            context_words_sim_th: minimum context-words similarity for a
                candidate to be kept.
            seed_candidats_sim_th: minimum context-words similarity for the
                best candidate of a mention to become a seed.
            believe_score_th: minimum believe score for a mention to be returned.

        Return: List[Mention]
        """
        if context_window != -1:
            self.context_window = context_window
        if context_words_sim_th != -1:
            self.context_words_sim_th = context_words_sim_th
        if seed_candidats_sim_th != -1:
            self.seed_candidates_sim_th = seed_candidats_sim_th
        if believe_score_th != -1:
            self.believe_score_th = believe_score_th

        mentions = []
        for start, end, mention_str, candidates in mention_list:
            # Clamp the character window around the mention to the document.
            prev_start = max(start - self.context_window, 0)
            after_end = min(end + self.context_window, len(document))
            prev_context_words = [
                word for word in self.word_parser.parse_text(
                    document[prev_start:start])
                if word in self.word_manager.vec_model.vectors
            ]
            after_context_words = [
                word for word in self.word_parser.parse_text(
                    document[end:after_end])
                if word in self.word_manager.vec_model.vectors
            ]
            # Build a new list: extending prev_context_words in place would
            # leak the "after" words into the mention's previous context.
            context_words = prev_context_words + after_context_words

            # Preliminary filter: keep candidates by context_words_sim.
            valid_candidates = []  # type: List[Candidate]
            for candidate_id in candidates:
                if not self.entity_manager.is_entity_has_embed(candidate_id):
                    continue
                entity = self.entity_manager.entity_dictionary.entity_dict.get(
                    candidate_id)
                if entity is None:
                    continue
                candidate = Candidate(candidate_id)
                candidate.set_entity(entity)
                candidate.set_context_words_sim(
                    self.cal_candidate_context_words_sim(
                        candidate_id, context_words))
                if candidate.context_words_sim > self.context_words_sim_th:
                    valid_candidates.append(candidate)

            if len(valid_candidates) > 0:
                mention = Mention(start, end, mention_str, valid_candidates)
                mention.set_prev_context(prev_context_words)
                mention.set_after_context(after_context_words)
                mentions.append(mention)

        # Start computing the context-entities similarity.
        seed_candidates = []  # type: List[Candidate]

        # Select seed candidates: the best candidate of each mention, if it
        # clears the seed threshold. Every mention has at least one candidate.
        for mention in mentions:
            max_cand = max(
                mention.candidates,
                key=lambda cand: cand.context_words_sim)
            if max_cand.context_words_sim > self.seed_candidates_sim_th:
                seed_candidates.append(max_cand)
                mention.set_result_cand(max_cand)

        # Context entities shared by all mentions: the seeds' entities.
        context_entities = [cand.entity for cand in seed_candidates]

        # Compute context_entities_sim for every candidate of every mention.
        for mention in mentions:
            if mention.result_cand is None:
                # Not yet disambiguated: compare against all seed candidates.
                seeds_for_mention = seed_candidates
            else:
                # Already disambiguated: exclude seeds that are this
                # mention's own candidates before comparing.
                own_entity_ids = [cand.entity_id for cand in mention.candidates]
                seeds_for_mention = [
                    seed_cand for seed_cand in seed_candidates
                    if seed_cand.entity_id not in own_entity_ids
                ]

            mention.set_context_entities(context_entities)
            for candidate in mention.candidates:
                candidate.set_context_entities_sim(
                    self.cal_candidate_context_entities_sim(
                        candidate.entity_id, seeds_for_mention))

        # Set each candidate's believe_score (fixed 0.3 / 0.7 weighting) and
        # pick the best candidate per mention.
        for mention in mentions:
            for cand in mention.candidates:
                cand.set_believe_score(0.3 * cand.context_words_sim +
                                       0.7 * cand.context_entities_sim)
            mention.candidates = sorted(
                mention.candidates,
                key=lambda item: item.believe_score,
                reverse=True)
            mention.set_result_cand(mention.candidates[0])

        # Filter the mentions once more by believe_score.
        refined_mentions = [
            m for m in mentions
            if m.result_cand.believe_score > self.believe_score_th
        ]

        # TODO: expand seed candidates here

        return refined_mentions
Exemplo n.º 17
0
def extract_mentions_hashtags(text):
    """
            Scan the text of a kweek for hashtags and mentions.

            A tag opens at a '#' (hashtag) or an '@' (mention) and runs up to,
            but not including, the next space character, or to the end of the
            text when no space follows. A marker that is the very last
            character of the text produces nothing. Scanning resumes right
            after the tag, so a marker appearing inside a tag does not open a
            tag of its own.


            *Parameters:*
                - *text*: The text of the kweek to be scanned.

            *Returns:*
                   -*Tuple*: {
                                | *hashtags (hashtag object )*: The list of kweek hashtags,
                                | *mention (mention object )*: The list of kweek mentions.
                                | }

    """
    hashtags = []
    mentions = []
    size = len(text)
    cursor = 0
    while cursor < size:
        marker = text[cursor]
        # Ordinary characters, and a marker with nothing after it, are skipped.
        if marker not in ('#', '@') or cursor == size - 1:
            cursor += 1
            continue

        # The tag ends at the first space after the marker, else at end of text.
        space_at = text.find(' ', cursor + 1)
        stop = size if space_at == -1 else space_at
        span = [cursor, stop]

        if marker == '#':
            # The hashtag text keeps its leading '#'.
            hashtags.append(Hashtag({
                'indices': span,
                'text': text[cursor:stop],
                'id': 0
            }))
        else:
            # The username drops the leading '@'.
            mentions.append(Mention({
                'indices': span,
                'username': text[cursor + 1:stop]
            }))

        # Continue after the terminating space (or fall off the end).
        cursor = stop + 1

    return hashtags, mentions  # lists of objects
Exemplo n.º 18
0
def get_kweek(kid, authorized_username, replies_only):
    """
           Build the requested kweek and collect its replies.


           *Parameters:*
               - *kid*: The id of the kweek to be retrieved.
               - *authorized_username(string)*: The user currently logged in.
               - *replies_only (bool)*: When true, only the replies are returned
                  and the kweek object itself is not built.

           *Returns:*
               -*Tuple*: {
                            | *check (bool)*: False when validate_request rejects the id,
                            | True otherwise.
                            | *message (str)*: The validation message; 'success.' once the
                            | kweek object has been built.
                            | *kweekobj (kweek object )*: The kweek, or None when validation
                            | fails or only the replies were requested.
                            | *replies*: The value returned by retrieve_replies (described
                            | in the original notes as reply ids), or None when validation fails.
                            | *code*: The code to be returned in the request.

                            | }

    """
    check, message, code = validate_request(kid)
    if not check:
        return check, message, None, None, code

    replies = retrieve_replies(kid)
    if replies_only:
        return True, message, None, replies, code

    # Fetches happen in the same order as before; the second argument of
    # retrieve_user selects which user rows come back (3 is bound to the
    # rekweeks, 2 to the likers, 1 to the kweek's own user).
    hashtag_rows = retrieve_hashtags(kid)
    mention_rows = retrieve_mentions(kid)
    rekweeker_rows = retrieve_user(kid, 3)
    liker_rows = retrieve_user(kid, 2)
    author_rows = retrieve_user(kid, 1)

    # A falsy result (nothing retrieved) yields an empty list.
    hashtags_list = [
        Hashtag({
            'id': row['hashtag_id'],
            'indices': [row['starting_index'], row['ending_index']],
            'text': row['text']
        })
        for row in (hashtag_rows or [])
    ]
    mentions_list = [
        Mention({
            'indices': [row['starting_index'], row['ending_index']],
            'username': row['username']
        })
        for row in (mention_rows or [])
    ]

    author = author_rows[0]
    me = authorized_username  # should be replaced by the function getting the current user
    author_name = author['username']

    # Relationship flags between the logged-in user and the author; the
    # author's own columns are merged in afterwards and win on key clashes.
    profile = {
        'following': bool(check_following(me, author_name)),
        'follows_you': bool(check_following(author_name, me)),
        'blocked': bool(check_blocked(author_name, me)),
        'muted': bool(check_muted(author_name, me))
    }
    profile.update(author)
    userobj = User(profile)

    def _tally(rows):
        # Returns (row count, whether the logged-in user is among the rows).
        if not rows:
            return 0, False
        # A list rather than a generator: every row is inspected, as before.
        return len(rows), any([row['username'] == me for row in rows])

    num_of_replies = len(replies) if replies else 0
    num_of_likes, liked_by_user = _tally(liker_rows)
    num_of_rekweeks, rekweeked_by_user = _tally(rekweeker_rows)

    kweekdic = {
        'hashtags': hashtags_list,
        'mentions': mentions_list,
        'number_of_likes': num_of_likes,
        'number_of_rekweeks': num_of_rekweeks,
        'number_of_replies': num_of_replies,
        'rekweek_info': None,
        'liked_by_user': liked_by_user,
        'rekweeked_by_user': rekweeked_by_user,
        'user': userobj
    }
    # Columns of the kweek row itself are laid over the computed fields.
    kweekdic.update(retrieve_kweek(kid)[0])
    kweekdic['reply_info'] = get_reply_to_info(kid)
    return True, 'success.', Kweek(kweekdic), replies, 200
Exemplo n.º 19
0
    def update(self):
        """
        Update tweets related to movies in local DB.

        For every movie returned by ``Movie(db).get_names()`` a hashtag is
        built with ``self.get_hashtag`` and searched on Twitter (English
        tweets, ``count=100``). For each tweet found, the author, the tweet,
        the tweet/movie link, the tweet's mentions and the mentioned users are
        written through their table wrappers.

        Every write is best-effort: a failing insert/update does not stop the
        run. Failures are no longer discarded silently; they are logged at
        DEBUG level together with the traceback.

        :return: None
        """
        # Function-scope import so this method does not depend on a
        # file-level ``import logging``.
        import logging

        log = logging.getLogger(__name__)

        # Get name for all the stored movies in the DB
        movies = Movie(db).get_names()
        print('Got movies')
        if not movies:
            return

        # Slots 3 and 4 are both read from 'created_at'; they are overwritten
        # below with the formatted date and the formatted time respectively.
        user_keys = ('id_str', 'name', 'description', 'created_at',
                     'created_at', 'followers_count', 'friends_count')

        for movie in movies:
            hashtag = self.get_hashtag(movie['MV_NAME'])
            mv_id = movie['MV_ID']
            # Search twitter for current movie hashtag in english language
            print('Searching for hashtag {}'.format(hashtag))
            results = self.api.GetSearch(hashtag, lang='en', count=100)
            # Get data for each tweet in search results and save to respective tables
            for tweet in results:
                print(tweet)
                author = tweet.user
                user_data = [getattr(author, key) for key in user_keys]

                # split time format before saving to the DB
                user_created = datetime.strptime(user_data[3],
                                                 self.tweet_time_format)
                user_data[3] = user_created.strftime(self.date_format)
                user_data[4] = user_created.strftime(self.time_format)

                try:
                    u = User(db)
                    if u.select_one(user_data[0]):
                        u.update(user_data)
                    else:
                        u.insert(user_data)
                except Exception:
                    # Best-effort write: record the failure and carry on.
                    log.debug('Could not save user %s', user_data[0],
                              exc_info=True)

                tweet_created = datetime.strptime(tweet.created_at,
                                                  self.tweet_time_format)
                tweet_date = tweet_created.strftime(self.date_format)
                tweet_time = tweet_created.strftime(self.time_format)
                tweet_data = [
                    tweet.id, tweet.full_text, hashtag, user_data[0],
                    tweet_date, tweet_time, tweet.retweet_count
                ]
                try:
                    Tweet(db).insert(tweet_data)
                except Exception:
                    log.debug('Could not save tweet %s', tweet_data[0],
                              exc_info=True)
                try:
                    TweetMovie(db).insert([tweet.id, mv_id])
                except Exception:
                    log.debug('Could not link tweet %s to movie %s',
                              tweet_data[0], mv_id, exc_info=True)

                # Add tweet mentions to the mentions table and any new user mentioned to the user table
                mentions = tweet.user_mentions
                if not mentions:
                    continue
                for mention in mentions:
                    m = Mention(db)
                    try:
                        m.insert([tweet.id, mention.id])
                    except Exception:
                        # The message only uses values read earlier, so the
                        # handler itself cannot raise on a bad mention object.
                        log.debug('Could not save a mention for tweet %s',
                                  tweet_data[0], exc_info=True)

                    try:
                        # Add user to the user table if not exists
                        User(db).insert_mention_user(
                            [mention.id, mention.name])
                    except Exception:
                        log.debug(
                            'Could not save a mentioned user for tweet %s',
                            tweet_data[0], exc_info=True)
Exemplo n.º 20
0
def test_get_kweek_with_replies():
    """
    Check that actions.get_kweek_with_replies returns a kweek and its replies.

    Three kweeks are seeded through db_manager: a root kweek by test_user1
    (one hashtag, one mention, one rekweek, one favorite) and two replies to
    it by test_user2 and test_user3 (one hashtag and one favorite each). The
    root kweek is then requested as test_user3 and the JSON form of the
    returned kweek and of every returned reply is compared with the expected
    objects. The number of returned replies is checked explicitly so that a
    short or empty reply list cannot pass unnoticed.
    """
    insert_kweek: str = """INSERT INTO  KWEEK (CREATED_AT,TEXT,MEDIA_URL,USERNAME,REPLY_TO) VALUES(%s, %s, %s, %s,%s) """
    select_last_kweek_id: str = """SELECT ID FROM KWEEK ORDER BY ID DESC LIMIT 1 """
    insert_hashtag: str = """INSERT INTO HASHTAG(TEXT) VALUES (%s) """
    select_hashtag_id: str = """SELECT ID FROM HASHTAG WHERE TEXT = %s """
    insert_kweek_hashtag: str = """INSERT INTO KWEEK_HASHTAG VALUES (%s,%s,%s,%s)"""
    insert_mention: str = """INSERT INTO MENTION VALUES(%s,%s,%s,%s) """
    insert_rekweek: str = """INSERT INTO REKWEEK VALUES(%s,%s,%s) """
    insert_favorite: str = """INSERT INTO FAVORITE VALUES(%s,%s,%s) """

    def seed_kweek(text, username, reply_to, hashtag_text):
        # Inserts a kweek and one hashtag linked to it at indices 0-9;
        # returns (kweek id as str, hashtag id).
        db_manager.execute_query_no_return(
            insert_kweek, ('01-01-2010', text, None, username, reply_to))
        # The kweek just inserted is read back as the row with the highest id.
        kid = str(db_manager.execute_query(select_last_kweek_id)[0]['id'])
        db_manager.execute_query_no_return(insert_hashtag, (hashtag_text,))
        hid = db_manager.execute_query(
            select_hashtag_id, (hashtag_text,))[0]['id']
        db_manager.execute_query_no_return(
            insert_kweek_hashtag, (kid, hid, 0, 9,))
        return kid, hid

    def expected_kweek(kid, text, screen_name, hashtag_text, hid, *,
                       following, follows_you, mentions=(), rekweeks=0,
                       replies=0, reply_to=None, liked_by_user=False):
        # Builds the Kweek expected for one seeded kweek; every seeded kweek
        # has exactly one like and one hashtag at indices [0, 9].
        return Kweek({
            'id': int(kid),
            'created_at': datetime(2010, 1, 1, 0, 0),
            'text': text,
            'media_url': None,
            'user': User({
                'username': '******',
                'screen_name': screen_name,
                'profile_image_url': 'image_url',
                'following': following,
                'follows_you': follows_you,
                'muted': False,
                'blocked': False
            }),
            'mentions': [
                Mention({'username': name, 'indices': indices})
                for name, indices in mentions
            ],
            'hashtags': [
                Hashtag({
                    'text': hashtag_text,
                    'indices': [0, 9],
                    'id': hid
                })
            ],
            'number_of_likes': 1,
            'number_of_rekweeks': rekweeks,
            'number_of_replies': replies,
            'reply_to': reply_to,
            'rekweek_info': None,
            'liked_by_user': liked_by_user,
            'rekweeked_by_user': False
        })

    # first kweek #
    kid1, hid1 = seed_kweek('test1', 'test_user1', None, 'hashtag1---')
    db_manager.execute_query_no_return(
        insert_mention, (kid1, 'test_user2', 10, 15))
    db_manager.execute_query_no_return(
        insert_rekweek, ('test_user2', kid1, '01-01-2010'))
    db_manager.execute_query_no_return(
        insert_favorite, ('test_user2', kid1, '01-01-2010'))

    # second kweek #
    kid2, hid2 = seed_kweek('test2', 'test_user2', kid1, 'hashtag2---')
    db_manager.execute_query_no_return(
        insert_favorite, ('test_user1', kid2, '01-01-2010'))

    # third kweek #
    kid3, hid3 = seed_kweek('test3', 'test_user3', kid1, 'hashtag3---')
    db_manager.execute_query_no_return(
        insert_favorite, ('test_user3', kid3, '01-01-2010'))

    kweek_test1 = expected_kweek(
        kid1, 'test1', 'test1', 'hashtag1---', hid1,
        following=True, follows_you=True,
        mentions=[('******', [10, 15])],
        rekweeks=1, replies=2)
    replies_test1 = [
        expected_kweek(
            kid2, 'test2', 'test2', 'hashtag2---', hid2,
            following=False, follows_you=True,
            reply_to=int(kid1)),
        expected_kweek(
            kid3, 'test3', 'test3', 'hashtag3---', hid3,
            following=False, follows_you=False,
            reply_to=int(kid1), liked_by_user=True),
    ]

    check_replies, message, k, r = actions.get_kweek_with_replies(kid1, 'test_user3')
    print('kwweeek')
    print(k)
    print('replies')
    print(r)
    assert check_replies is True
    assert message == 'success'
    assert k.to_json() == kweek_test1.to_json()
    # Without this check an empty or truncated reply list would pass the loop.
    assert len(r) == len(replies_test1)
    for actual, expected in zip(r, replies_test1):
        assert actual.to_json() == expected.to_json()
Exemplo n.º 21
0
from sqlalchemy import create_engine
from models import Source, Mention, Base
from sqlalchemy.orm import sessionmaker

# Engine for the Postgres database named in the URL (psycopg2 driver).
engine = create_engine('postgresql+psycopg2://docker:docker@db/docker')
# Emit CREATE TABLE for the models registered on Base.
Base.metadata.create_all(engine)

# Session factory bound to the engine, and the session used below.
Session = sessionmaker(bind=engine)
session = Session()

# Seed one source and one mention that refers to it, then persist both.
s = Source(id=1, name='Twitter')
m = Mention(id=1, source=s, text='jeffknupp.com is the best website ever!')
session.add_all([s, m])
session.commit()