Example #1
def text_iterator(use_wiki, wiki_location,
                  use_qb, qb_location,
                  use_source, source_location,
                  limit=-1,
                  min_pages=0, country_list=COUNTRY_LIST_PATH):
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location
    doc_num = 0

    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for p in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This filter must stay in sync with the page_by_count function in
        # qdb.py so that both apply the same minimum-appearance threshold
        if len(pages[p]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[p] if x.fold == "train"]
            question_text = "\n".join(" ".join(x.raw_words()) for x in train_questions)
        else:
            question_text = ''

        if use_source:
            filename = '%s/%s' % (source_location, p)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        # Decode immediately so source_text is str, matching the
                        # other branches and the concatenation below
                        source_text = f.read().decode('utf-8')
                except zlib.error:
                    log.info("Error reading %s" % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[p].content
        else:
            wikipedia_text = u""

        total_text = "\n".join([wikipedia_text, question_text, source_text])

        yield p, total_text
        doc_num += 1

        # Stop once the requested number of documents has been yielded
        if 0 < limit <= doc_num:
            break
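
A minimal usage sketch for text_iterator, assuming hypothetical wiki_path, qb_path, and source_path locations; the generator yields (page, text) pairs ready for a downstream vectorizer:

# Hypothetical dataset locations; substitute the real paths.
wiki_path = 'data/wikipedia'
qb_path = 'data/questions.db'
source_path = 'data/source'

for page, text in text_iterator(True, wiki_path,
                                True, qb_path,
                                False, source_path,
                                limit=10, min_pages=2):
    print(page, len(text))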
Example #2
def train_classifier(class_type, question_db=None):
    if question_db is None:
        question_db = QuestionDatabase(QB_QUESTION_DB)

    log.info("Training classifier: {}".format(class_type))
    all_questions = question_db.questions_with_pages()
    train = compute_features(all_questions, 'train', class_type)
    train_x = train['text']
    train_y = train['label']
    classifier = pipeline_creators[class_type]().fit(train_x, train_y)
    return classifier
Example #3
    def initialize_cache(path):
        """
        This function iterates over all pages and accessing them in the cache. This forces a
        prefetch of all wiki pages
        """
        db = QuestionDatabase(QB_QUESTION_DB)
        pages = db.questions_with_pages()
        cw = CachedWikipedia(path)
        pool = Pool()

        input_data = [(format_guess(title), cw) for title in pages.keys()]
        pool.starmap(access_page, input_data)
Example #4
def web_initialize_file_cache(path, remote_delay=1):
    """
    Initialize the cache by requesting each page with wikipedia package.
    This function iterates over all pages and accessing them in the cache. This forces a
    prefetch of all wiki pages
    """
    db = QuestionDatabase()
    pages = db.questions_with_pages()
    cw = CachedWikipedia(path, remote_delay=remote_delay)
    pool = Pool()

    input_data = [(title, cw) for title in pages.keys()]
    pool.starmap(access_page, input_data)
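
Both cache-priming helpers fan the page titles out to a multiprocessing Pool via starmap, so the access_page worker they pass must accept (title, cw). The worker's body is not shown in these examples; a minimal sketch, assuming CachedWikipedia supports the dict-style lookup used in text_iterator above:

def access_page(title, cw):
    # Hypothetical worker: indexing into CachedWikipedia fetches the page
    # (from disk or remotely) and stores it in the cache as a side effect.
    page = cw[title]
    return len(page.content)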
Example #5
def preprocess_titles():
    # stop_words = set(stopwords.words('english'))
    db = QuestionDatabase()
    pages = {format_guess(page) for page in db.questions_with_pages().keys()}
    with open('data/titles-sorted.txt') as titles_file, \
            open('data/processed-titles-sorted.txt', 'w') as f:
        for line in titles_file:
            page = format_guess(line.strip().lower())
            # if len(page) > 2 and re.match(r"^[a-zA-Z0-9_()']+$", page)\
            #         and page not in stop_words and page[0].isalnum():
            if page in pages:
                f.write(line.strip().lower())
            else:
                f.write('@')
            f.write('\n')
Example #6
def wikify(output_directory):
    database = QuestionDatabase(QB_QUESTION_DB)
    pages = database.questions_with_pages()

    total = 0
    for p in pages:
        if len(pages[p]) >= conf['wikifier']['min_appearances']:
            log.info('{} {}'.format(p, len(pages[p])))
            for q in pages[p]:
                total += 1
                for sentence, word, text in q.partials():
                    sentence -= 1
                    output_path = "%s/%i-%i.txt" % (output_directory, q.qnum, sentence)
                    with open(output_path, 'w') as output:
                        output.write("%s\n" % text[sentence])
    log.info(str(total))
Example #7
File: stats.py Project: xxlatgh/qb
    def __init__(self):
        super(StatsExtractor, self).__init__()
        with open(SENTENCE_STATS, 'rb') as f:
            self.word_count_mean, self.word_count_std = pickle.load(f)

        self.guess_frequencies = {}
        question_db = QuestionDatabase(QB_QUESTION_DB)
        all_questions = question_db.questions_with_pages()
        for page in all_questions:
            self.guess_frequencies[page] = sum(1 for x in all_questions[page]
                                               if x.fold == "train")

        self.frequency_mean = np.mean(list(self.guess_frequencies.values()))
        self.frequency_std = np.std(list(self.guess_frequencies.values()))
        for page in all_questions:
            normalized_frequency = normalize(self.guess_frequencies[page],
                                             self.frequency_mean,
                                             self.frequency_std)
            self.guess_frequencies[page] = normalized_frequency
        self.normed_missing_guess = normalize(0, self.frequency_mean,
                                              self.frequency_std)
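
StatsExtractor standardizes each page's training-fold guess frequency with a normalize helper whose definition is not part of this example. A minimal sketch, under the assumption that it is a plain z-score against the supplied mean and standard deviation:

def normalize(value, mean, std):
    # Assumed implementation: standard score of the value relative to the
    # frequency distribution computed over all pages.
    return (value - mean) / std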
Example #8
def create_report(classifier, class_type, question_db=None):
    if question_db is None:
        question_db = QuestionDatabase(QB_QUESTION_DB)

    all_questions = question_db.questions_with_pages()
    train = compute_features(all_questions, 'train', class_type)
    train_x = train['text']
    train_y = train['label']
    dev = compute_features(all_questions, 'dev', class_type)
    dev_x = dev['text']
    dev_y = dev['label']
    train_score = classifier.score(train_x, train_y)
    dev_score = classifier.score(dev_x, dev_y)

    true_labels = dev['label'].values
    predicted_labels = classifier.predict(dev_x)

    cf_norm = '/tmp/norm_confusion.png'
    plot_confusion(
        'Row Normalized Confusion Matrix of {} Classification'.format(
            class_type),
        true_labels,
        predicted_labels,
        normalized=True)
    plt.savefig(cf_norm, format='png', dpi=200)
    plt.clf()
    plt.cla()
    plt.close()

    cf_unnorm = '/tmp/unnorm_confusion.png'
    plot_confusion('Unnormalized Confusion Matrix of {} Classification'.format(
        class_type),
                   true_labels,
                   predicted_labels,
                   normalized=False)
    plt.savefig(cf_unnorm, format='png', dpi=200)

    correct_by_position = '/tmp/correct_by_position.png'

    dev['prediction'] = pd.Series(predicted_labels)
    dev['correct'] = dev['prediction'] == dev['label']
    counts = pd.pivot_table(dev,
                            values=['text'],
                            index=['sentence', 'correct'],
                            aggfunc=len).unstack(fill_value=0)
    counts.plot.bar(title='Number of Questions Correct vs Sentence Number')
    plt.xlabel('Sentence Number')
    plt.ylabel('Number Correct')
    handles, labels = plt.gca().get_legend_handles_labels()
    plt.gca().legend(handles, ['Number Incorrect', 'Number Correct'])
    plt.savefig(correct_by_position, format='png', dpi=200)

    report = ReportGenerator(
        {
            'unnormalized_confusion_plot': cf_unnorm,
            'normalized_confusion_plot': cf_norm,
            'correct_by_position_plot': correct_by_position,
            'train_score': train_score,
            'dev_score': dev_score,
            'class_type': class_type
        }, 'classifier.md')
    output = safe_path(CLASSIFIER_REPORT_PATH.format(class_type))
    report.create(output)
    plt.clf()
    plt.cla()
    plt.close()
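
A sketch of how the training and reporting helpers above would typically be chained, assuming 'category' stands in for one of the keys of pipeline_creators (the real class_type values are defined elsewhere in the project):

# Hypothetical class_type value; use a key that pipeline_creators actually defines.
class_type = 'category'
question_db = QuestionDatabase(QB_QUESTION_DB)

classifier = train_classifier(class_type, question_db=question_db)
create_report(classifier, class_type, question_db=question_db)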