示例#1
0
def main():
    """Train a BinaryTreeRegressor pipeline on SemEval regression data and
    write tab-separated per-item predictions to an output file."""
    # Make stdout UTF-8 capable regardless of the console's native encoding.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    arg_parser = argparse.ArgumentParser(description='')
    arg_parser.add_argument('-i', '--input', help='Input file', required=True)
    arg_parser.add_argument('-t', '--test', help='Test file', required=True)
    arg_parser.add_argument('-o', '--output', help='Output filename prefix',
                            required=True)
    arg_parser.add_argument('-c', '--c', help='C value for SVM', type=float,
                            default=1.0)
    arg_parser.add_argument('-k', '--k', help='Number of features to keep',
                            type=int, default=1000)
    args = arg_parser.parse_args()

    corpus = read_semeval_regression(args.input, encoding='windows-1252')

    rich_analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    # Bag-of-ngrams -> tf-idf -> chi2 feature selection -> tree of SVMs.
    pipe = Pipeline([
        ('vect', CountVectorizer(analyzer=rich_analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c),
                                    verbose=False)),
    ])

    test_data = read_test_data(args.test, encoding='windows-1252')

    model = pipe.fit(corpus[0], corpus[1])
    predictions = model.predict(test_data[2])

    out_name = '%sc%f-k%i-C.output' % (args.output, args.c, args.k)
    with open(out_name, 'w', encoding='utf8') as outfile:
        for item_id, item_topic, predicted_rate in zip(test_data[0],
                                                       test_data[1],
                                                       predictions):
            print(item_id, item_topic, predicted_rate, sep='\t', file=outfile)
 def __init__(self, pipeline=None):
     """Store the estimation pipeline.

     pipeline: estimator/pipeline used for fitting and prediction; when
     None, defaults to a BinaryTreeRegressor over LinearSVC(C=100.0).
     """
     if pipeline is None:
         self._pipeline = BinaryTreeRegressor(base_estimator=LinearSVC(C=100.0))
     else:
         self._pipeline = pipeline
class RegressionQuantifier(object):
    """Group-level quantifier: predicts label prevalences per group and
    corrects them using global prevalences estimated on training data."""

    def __init__(self, pipeline=None):
        # Default to a binary-tree regressor over a strongly-regularized
        # linear SVM when the caller supplies no pipeline.
        if pipeline is None:
            self._pipeline = BinaryTreeRegressor(base_estimator=LinearSVC(C=100.0))
        else:
            self._pipeline = pipeline

    def fit(self, X, y, groups):
        """Fit the pipeline on (X, y) and estimate global prevalences via
        leave-one-group-out cross-prediction over the training groups."""
        total = len(y)
        self._values = sorted(set(y))
        self._true_global_prevalences = defaultdict(float)
        for value in self._values:
            self._true_global_prevalences[value] = y.count(value) / total
        self._estimated_global_prevalences = defaultdict(float)
        # Hold each group out, train on the rest, and accumulate the
        # predicted label counts for the held-out group.
        for held_out in set(groups):
            held_X = [text for text, grp in zip(X, groups) if grp == held_out]
            rest_X = [text for text, grp in zip(X, groups) if grp != held_out]
            rest_y = [label for label, grp in zip(y, groups) if grp != held_out]
            fold_model = clone(self._pipeline).fit(rest_X, rest_y)
            fold_predictions = fold_model.predict(held_X)
            for value in self._values:
                self._estimated_global_prevalences[value] += fold_predictions.count(value)
        for value in self._values:
            self._estimated_global_prevalences[value] /= total
        self._model = self._pipeline.fit(X, y)

    def predict(self, X, groups):
        """Return {group: (plain, train-corrected, test-corrected)} prevalence
        triples, each list aligned with the sorted label values."""
        predictions = self._model.predict(X)
        test_global_prevalences = defaultdict(float)
        for value in self._values:
            test_global_prevalences[value] = predictions.count(value) / len(X)
        quantifications = dict()
        for group in set(groups):
            members = [pred for pred, grp in zip(predictions, groups) if grp == group]
            size = len(members)
            plain = [members.count(value) / size for value in self._values]
            corrected = []
            test_corrected = []
            for value, prevalence in zip(self._values, plain):
                truth = self._true_global_prevalences[value]
                estimated = self._estimated_global_prevalences[value]
                # Fall back to the uncorrected prevalence when the global
                # estimate is zero (avoids division by zero).
                corrected.append(prevalence * truth / estimated
                                 if estimated != 0 else prevalence)
                observed = test_global_prevalences[value]
                test_corrected.append(prevalence * truth / observed
                                      if observed != 0 else prevalence)
            # Renormalize each corrected distribution to sum to one.
            corrected_sum = sum(corrected)
            corrected = [p / corrected_sum for p in corrected]
            test_sum = sum(test_corrected)
            test_corrected = [p / test_sum for p in test_corrected]
            quantifications[group] = (plain, corrected, test_corrected)
        return quantifications
示例#4
0
 def __init__(self, pipeline=None):
     """Store the estimation pipeline.

     pipeline: estimator/pipeline used for fitting and prediction; when
     None, defaults to a BinaryTreeRegressor over LinearSVC(C=100.0).
     """
     if pipeline is None:
         self._pipeline = BinaryTreeRegressor(base_estimator=LinearSVC(
             C=100.0))
     else:
         self._pipeline = pipeline
示例#5
0
class RegressionQuantifier(object):
    """Quantifier that estimates per-group label prevalences and corrects
    them with globally measured training/test prevalences."""

    def __init__(self, pipeline=None):
        # Use the caller's pipeline, or fall back to a BinaryTreeRegressor
        # over LinearSVC(C=100.0).
        self._pipeline = (BinaryTreeRegressor(base_estimator=LinearSVC(C=100.0))
                          if pipeline is None else pipeline)

    def fit(self, X, y, groups):
        """Learn true and cross-validated global prevalences, then fit the
        pipeline on the full training set."""
        n = len(y)
        self._values = sorted(set(y))
        self._true_global_prevalences = defaultdict(float)
        for v in self._values:
            self._true_global_prevalences[v] = y.count(v) / n
        self._estimated_global_prevalences = defaultdict(float)
        # Leave-one-group-out: predict each group with a model trained on
        # all remaining groups, accumulating predicted label counts.
        for g in set(groups):
            held = [x for x, grp in zip(X, groups) if grp == g]
            train_X = [x for x, grp in zip(X, groups) if grp != g]
            train_y = [lab for lab, grp in zip(y, groups) if grp != g]
            preds = clone(self._pipeline).fit(train_X, train_y).predict(held)
            for v in self._values:
                self._estimated_global_prevalences[v] += preds.count(v)
        for v in self._values:
            self._estimated_global_prevalences[v] /= n
        self._model = self._pipeline.fit(X, y)

    def predict(self, X, groups):
        """Return {group: (plain, train-corrected, test-corrected)}
        prevalence lists, each aligned with the sorted label values."""
        preds = self._model.predict(X)
        test_global = defaultdict(float)
        for v in self._values:
            test_global[v] = preds.count(v) / len(X)
        out = dict()
        for g in set(groups):
            grp_preds = [p for p, grp in zip(preds, groups) if grp == g]
            size = len(grp_preds)
            plain = []
            by_train = []
            by_test = []
            for v in self._values:
                share = grp_preds.count(v) / size
                plain.append(share)
                est = self._estimated_global_prevalences[v]
                # Skip the correction when the reference prevalence is zero
                # (would otherwise divide by zero).
                by_train.append(share * self._true_global_prevalences[v] / est
                                if est != 0 else share)
                obs = test_global[v]
                by_test.append(share * self._true_global_prevalences[v] / obs
                               if obs != 0 else share)
            # Renormalize the corrected distributions to sum to one.
            train_sum = sum(by_train)
            by_train = [p / train_sum for p in by_train]
            test_sum = sum(by_test)
            by_test = [p / test_sum for p in by_test]
            out[g] = (plain, by_train, by_test)
        return out
def main():
    """Train a quantification pipeline on SemEval data and write plain,
    train-corrected and test-corrected per-topic prevalences to files."""
    # Force UTF-8 on stdout irrespective of the terminal encoding.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)

    cli = argparse.ArgumentParser(description='')
    cli.add_argument('-i', '--input', help='Input file', required=True)
    cli.add_argument('-t', '--test', help='Test file', required=True)
    cli.add_argument('-o', '--output', help='Output filename prefix',
                     required=True)
    cli.add_argument('-c', '--c', help='C value for SVM', type=float,
                     default=1.0)
    cli.add_argument('-k', '--k', help='Number of features to keep', type=int,
                     default=1000)
    args = cli.parse_args()

    grouped = read_semeval_quantification_regression(args.input,
                                                     encoding='windows-1252')

    # Flatten the per-topic corpus into parallel text/label/topic lists.
    texts = list()
    labels = list()
    topics = list()
    for topic in grouped:
        topic_texts, topic_labels = grouped[topic]
        texts.extend(topic_texts)
        labels.extend(topic_labels)
        topics.extend(topic for _ in topic_labels)

    rich_analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    # Bag-of-ngrams -> tf-idf -> chi2 feature selection -> tree of SVMs.
    pipe = Pipeline([
        ('vect', CountVectorizer(analyzer=rich_analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c),
                                    verbose=False)),
    ])

    _, test_topics, test_texts = read_test_data(args.test,
                                                encoding='windows-1252')

    quantifier = RegressionQuantifier(pipe)
    quantifier.fit(texts, labels, topics)
    quantification = quantifier.predict(test_texts, test_topics)

    # One output file per correction scheme; names carry the C and k values.
    name_parts = (args.output, args.c, args.k)
    plain_name = '%sc%f-k%i-plain-E.output' % name_parts
    train_name = '%sc%f-k%i-corrected_train-E.output' % name_parts
    test_name = '%sc%f-k%i-corrected_test-E.output' % name_parts
    with open(plain_name, 'w', encoding='utf8') as plainfile, \
            open(train_name, 'w', encoding='utf8') as corrected_trainfile, \
            open(test_name, 'w', encoding='utf8') as corrected_testfile:
        for topic in sorted(quantification):
            plain, corrected_train, corrected_test = quantification[topic]
            print(topic, *plain, sep='\t', file=plainfile)
            print(topic, *corrected_train, sep='\t', file=corrected_trainfile)
            print(topic, *corrected_test, sep='\t', file=corrected_testfile)