示例#1
0
文件: explorer.py 项目: michal3141/ed
def explore_questions_by_topic(quora_data, root_question, topic):
    """Breadth-first search for the closest question tagged with ``topic``.

    Starting at ``root_question``, follows the ``'related_questions'`` links
    of each crawled question until one whose ``'topics'`` contains ``topic``
    is reached. Because the search is breadth-first, the reported path is one
    with the fewest hops. The topics of every visited question and the final
    outcome are printed.

    Args:
        quora_data: iterable of documents. Each document is indexed with the
            key returned by ``_get_question(document)``; the value found there
            must provide ``'related_questions'`` and ``'topics'`` entries.
        root_question: key of the question the search starts from.
        topic: topic to look for in each visited question's ``'topics'``.

    Returns:
        The list of question keys from ``root_question`` to the first matching
        question, or ``None`` when no crawled question reachable from the root
        carries the topic (including when the root itself was never crawled).
    """
    # Function-scope import keeps the block self-contained; deque gives O(1)
    # pops from the left, unlike list.pop(0).
    from collections import deque

    d = {}
    for document in quora_data:
        question = _get_question(document)
        # Shallow copy so that normalizing the links does not rewrite the
        # caller's documents.
        entry = dict(document[question])
        # NOTE(review): _sanitize_question is presumably what maps a raw link
        # onto the key format used in `d` -- confirm against its definition.
        entry['related_questions'] = [
            _sanitize_question(x) for x in entry['related_questions']
        ]
        d[question] = entry

    found_path = None
    # A root that was never crawled has no stats to inspect, so there is
    # nothing to explore.
    if root_question in d:
        already_explored_questions = {root_question}
        # Each queue item is a complete path; its last element is the node to expand.
        questions_queue = deque([[root_question]])
        while questions_queue:
            questions_path = questions_queue.popleft()
            question = questions_path[-1]
            print(d[question]['topics'])
            if topic in d[question]['topics']:
                found_path = questions_path
                break
            for related_question in d[question]['related_questions']:
                # Links to questions that were never crawled cannot be expanded.
                if related_question not in d:
                    continue
                if related_question in already_explored_questions:
                    continue
                already_explored_questions.add(related_question)
                questions_queue.append(questions_path + [related_question])

    if found_path is None:
        # Previously the last explored path was reported as if it led to the topic.
        print('No path leading to %s found from %r' % (topic, root_question))
    else:
        print('Path leading to %s : %r' % (topic, found_path))
    return found_path
示例#2
0
文件: explorer.py 项目: michal3141/ed
def explore_questions_by_topic(quora_data, root_question, topic):
    """Breadth-first search for the closest question tagged with ``topic``.

    Starting at ``root_question``, follows the ``'related_questions'`` links
    of each crawled question until one whose ``'topics'`` contains ``topic``
    is reached. Because the search is breadth-first, the reported path is one
    with the fewest hops. The topics of every visited question and the final
    outcome are printed.

    Args:
        quora_data: iterable of documents. Each document is indexed with the
            key returned by ``_get_question(document)``; the value found there
            must provide ``'related_questions'`` and ``'topics'`` entries.
        root_question: key of the question the search starts from.
        topic: topic to look for in each visited question's ``'topics'``.

    Returns:
        The list of question keys from ``root_question`` to the first matching
        question, or ``None`` when no crawled question reachable from the root
        carries the topic (including when the root itself was never crawled).
    """
    # Function-scope import keeps the block self-contained; deque gives O(1)
    # pops from the left, unlike list.pop(0).
    from collections import deque

    d = {}
    for document in quora_data:
        question = _get_question(document)
        # Shallow copy so that normalizing the links does not rewrite the
        # caller's documents.
        entry = dict(document[question])
        # NOTE(review): _sanitize_question is presumably what maps a raw link
        # onto the key format used in `d` -- confirm against its definition.
        entry['related_questions'] = [
            _sanitize_question(x) for x in entry['related_questions']
        ]
        d[question] = entry

    found_path = None
    # A root that was never crawled has no stats to inspect, so there is
    # nothing to explore.
    if root_question in d:
        already_explored_questions = {root_question}
        # Each queue item is a complete path; its last element is the node to expand.
        questions_queue = deque([[root_question]])
        while questions_queue:
            questions_path = questions_queue.popleft()
            question = questions_path[-1]
            print(d[question]['topics'])
            if topic in d[question]['topics']:
                found_path = questions_path
                break
            for related_question in d[question]['related_questions']:
                # Links to questions that were never crawled cannot be expanded.
                if related_question not in d:
                    continue
                if related_question in already_explored_questions:
                    continue
                already_explored_questions.add(related_question)
                questions_queue.append(questions_path + [related_question])

    if found_path is None:
        # Previously the last explored path was reported as if it led to the topic.
        print('No path leading to %s found from %r' % (topic, root_question))
    else:
        print('Path leading to %s : %r' % (topic, found_path))
    return found_path
示例#3
0
文件: crawler.py 项目: michal3141/ed
    def _crawl_by_question(self, question, depth):
        """Crawl ``question`` and, recursively, the questions related to it.

        Nothing happens when ``depth`` is greater than ``self.maxdepth`` or
        when the question is already in ``self.crawled_questions`` or
        ``self.bad_questions``. Otherwise the question's stats and latest
        answers are fetched through ``Quora``, printed, remembered in
        ``self.crawled_questions`` and inserted into ``self.db.questions``.
        Each related question whose text does not end in ``'...'`` is then
        crawled at ``depth + 1``.

        A question for which ``Quora.get_question_stats`` returns an empty
        dict is added to ``self.bad_questions`` and not processed further.

        Args:
            question: identifier of the question to crawl.
            depth: current recursion depth, compared against ``self.maxdepth``.
        """
        # Guard: the depth budget is exhausted.
        if depth > self.maxdepth:
            return
        # Guard: the question was handled before, successfully or not.
        if question in self.crawled_questions:
            return
        if question in self.bad_questions:
            return

        print('crawling question: %s' % question)
        stats = Quora.get_question_stats(question)

        # An empty dict signals that fetching this question failed.
        if stats == {}:
            self.bad_questions.add(question)
            return

        answers = Quora.get_latest_answers(question)
        stats['latest_answers'] = answers

        # Single-argument prints: each label ends in a newline, so the
        # emitted text matches a comma-separated print of the same items.
        print('question_stats:\n%s' % (stats,))
        print('latest_answers:\n%s' % (answers,))
        print('related_questions: \n%s' % (stats['related_questions'],))
        print('---------------------------------------------------')

        self.crawled_questions[question] = stats

        # Persist incrementally so results survive an interrupted crawl.
        # NOTE(review): self.db.questions looks like a database collection
        # handle -- confirm which client provides `insert`.
        self.db.questions.insert({question: stats})

        for candidate in stats['related_questions']:
            # Titles ending in '...' are incomplete, so they are skipped.
            if candidate.endswith('...'):
                continue
            self._crawl_by_question(_sanitize_question(candidate), depth + 1)