Example #1
import os
import logging

# `separate_title_from_body` and `is_monocase` are project-level helpers;
# a module-level logger is assumed, created here in the standard way.
logger = logging.getLogger(__name__)


def main(docids, directory):
    """Print the ids of documents whose title is not monocase."""
    good_cnt = 0
    for i, id_ in enumerate(docids):
        if i % 1000 == 0:
            logger.info('{}/{}/{}'.format(good_cnt, i, len(docids)))

        path = os.path.join(directory, id_)
        # each document comes as an .auxil/.paf file pair
        titles, _ = separate_title_from_body(path + '.auxil', path + '.paf')
        tokens = [t['token'] for t in titles[0]['features']]
        if not is_monocase(tokens):
            print(id_)
            good_cnt += 1
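
A minimal driver sketch for the example above. The corpus directory is hypothetical; the document ids are derived from the `.auxil` files on disk:

import glob
import os

corpus_dir = '/path/to/corpus'  # hypothetical location of the .auxil/.paf pairs
docids = [os.path.splitext(os.path.basename(p))[0]
          for p in glob.glob(os.path.join(corpus_dir, '*.auxil'))]
main(docids, corpus_dir)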
Example #2
# Python 2 code: it relies on `unicode`/`basestring`. `logger`,
# `separate_title_from_body`, `convert_to_trainable_format`,
# `make_capitalized_title`, `FeatureExtractor` and the custom error classes
# are project-level; `compose`/`get_in` are assumed to be the `toolz`
# implementations, and `Path` needs the `pathlib` backport on Python 2.
import json
import traceback
from functools import partial

from pathlib import Path
from toolz import compose, get_in


def printable_train_data(malform_data_dir,
                         okform_data_dir,
                         ids,
                         extractor,
                         feature_names,
                         start,
                         end=None,
                         title_transform_func=make_capitalized_title,
                         exclude_labels=None,
                         exclude_word_positions=set([0])):
    """

    Adapted to PULS requirement:
    
    - auxil file is read to get the additional prepreocessed features

    Parameters
    ------------
    malform_data_dir: string
        the directory where the malformed data reside
    okform_data_dir: string
        the directory where the correctly formed data reside
    ids: list of string
        document ids
    extractor: FeatureExtractor
        the feature extractor
    feature_names: list of string
        the feature names
    start, end: int
        how many titles to extract
    title_transform_func: function
        funtion that accepts the title and transforms it
        into some badly capitalized version
    exclude_labels: iterable of str
        labels that we don't consider

    Returns
    ------------
    Generator of str:
        each str is one sentence, each line in the str is one token in the sent
        
    """
    feature_names = feature_names + ['y']  # add the label column without mutating the caller's list
    malform_data_dir = Path(malform_data_dir)
    okform_data_dir = Path(okform_data_dir)  # used with the `/` operator below

    # take care of this ["tickerSymbol",["NYSE","SKT"]]
    # /cs/taatto/home/hxiao/capitalization-recovery/corpus/puls-format-capitalized/3987E0BD03749C996A04B881079AD753.auxil
    clean_tag = (lambda t: t[0] if isinstance(t, list) else t)
    get_tokens = partial(map, partial(get_in, ['token']))
    get_tags = partial(map, compose(clean_tag, partial(get_in, ['pos'])))
    get_lemmas = partial(map, partial(get_in, ['lemma']))

    n_collected = 0

    for i, id_ in enumerate(ids):
        if i < start:
            continue

        if i % 1000 == 0:
            logger.info("Collected %d" % n_collected)
            logger.info("Finished %d" % i)

        if end is not None and i >= end:
            logger.info("Reached %d. Terminate." % end)
            break

        try:
            malform_auxil_path = (malform_data_dir /
                                  Path(id_)).with_suffix('.auxil')
            with malform_auxil_path.open(encoding='utf8') as f:
                logger.debug('processing: {}'.format(id_))
                # to get the last line
                lines = f.readlines()
                if len(lines) == 0:
                    raise EmptyFileError(
                        'auxil file empty: {}'.format(malform_auxil_path))

                last_line = lines[-1]

                data = json.loads(last_line.strip())

                okform_auxil_path = str(
                    (okform_data_dir / Path(id_)).with_suffix('.auxil'))
                okform_paf_path = str(
                    (okform_data_dir / Path(id_)).with_suffix('.paf'))

                good_title_sents, body_sents = separate_title_from_body(
                    okform_auxil_path, okform_paf_path)

                # extract the tokens
                doc = [[t['token'] for t in sent['features']]
                       for sent in body_sents]

                good_title_sents = list(good_title_sents)

                bad_title_sents = data['sents']
                if not isinstance(bad_title_sents, list):
                    raise InvalidTitleError(
                        'bad_title_sents not a list: {}'.format(
                            bad_title_sents))

                # we only consider headlines that contain exactly ONE sentence
                if (len(good_title_sents) == 1 and len(bad_title_sents) == 1):
                    good_sent = good_title_sents[0]
                    bad_sent = bad_title_sents[0]
                    good_title_tokens = get_tokens(good_sent['features'])
                    bad_title_tokens = get_tokens(bad_sent['features'])

                    # some validity checking
                    if len(good_title_tokens) != len(bad_title_tokens):
                        raise TitleInconsistencyError('{}\n{}'.format(
                            good_title_tokens, bad_title_tokens))

                    good_title_tokens_lower = map(lambda s: s.lower(),
                                                  good_title_tokens)
                    bad_title_tokens_lower = map(lambda s: s.lower(),
                                                 bad_title_tokens)
                    if (good_title_tokens_lower != bad_title_tokens_lower):
                        raise TitleInconsistencyError('{}\n{}'.format(
                            good_title_tokens_lower, bad_title_tokens_lower))

                    tags = get_tags(bad_sent['features'])
                    lemmas = get_lemmas(bad_sent['features'])

                    # tag validity checking
                    for tag in tags:
                        if not (tag is None or isinstance(tag, basestring)):
                            raise InvalidTitleError(
                                '{}: tag {} not string'.format(id_, tag))

                    # get malformed title tokens
                    words = convert_to_trainable_format(good_title_tokens,
                                                        title_transform_func,
                                                        extractor,
                                                        doc=doc,
                                                        pos=tags,
                                                        lemma=lemmas)

                    # format the features in the required form;
                    # use `j` so the document counter `i` is not shadowed
                    res = unicode()
                    for j, word in enumerate(words):
                        if (j not in exclude_word_positions
                                and (exclude_labels is None
                                     or word['y'] not in exclude_labels)):
                            word_feature_str = u'\t'.join([
                                unicode(word[feature_name])
                                for feature_name in feature_names
                            ])
                            res += word_feature_str + '\n'
                    n_collected += 1
                    yield id_, res
                else:
                    raise TitleInconsistencyError(
                        'not exactly one title sentence: {}'.format(id_))
        except (IOError, TitleInconsistencyError, InvalidTitleError,
                EmptyFileError):
            logger.debug(traceback.format_exc())
            continue
        except Exception:
            # log anything unexpected and keep processing the remaining ids
            logger.error(traceback.format_exc())
            continue
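
A minimal driver sketch for the generator above. The directories, the id file and the feature names are hypothetical, and `FeatureExtractor` comes from the surrounding project; the blank line between sentences follows the usual CRF training-file convention:

import codecs

extractor = FeatureExtractor()
with open('train.ids') as f:  # hypothetical file with one document id per line
    ids = [line.strip() for line in f]

with codecs.open('train.txt', 'w', encoding='utf8') as out:
    for id_, sent in printable_train_data('/path/to/malformed',
                                          '/path/to/okform',
                                          ids,
                                          extractor,
                                          feature_names=['word', 'pos', 'lemma'],
                                          start=0,
                                          end=10000):
        out.write(sent)
        out.write(u'\n')  # blank line between sentences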
Example #3
# `separate_title_from_body`, `is_consistent_prediction`, `eval_stat`,
# `print_label_error`, `get_label` and `logger` are project-level helpers.
import json
import traceback

import numpy as np
from pathlib import Path


def eval_rule_based(output_path, okform_dir,
                    accepted_labels=set(['AL', 'IC']),
                    print_errors=False):
    """
    Return:
    numpy.ndarray: (#label, 3)
    count of #match, #mode, #ref for each label
    
    First word of sentence is ignored
    """
    ret_stat = np.zeros((len(accepted_labels), 3),
                        dtype=np.float64)
    
    n_finished = 0
    n_errorless = 0
    
    with Path(output_path).open('r', encoding='utf8') as prediction_file:
        while True:
            if n_finished % 1000 == 0:
                logger.info('Finished {}/{}'.format(n_errorless, n_finished))
                
            # predictions come in pairs of lines: the document id,
            # then the prediction JSON; stop at EOF
            line1 = prediction_file.readline()
            line2 = prediction_file.readline()

            if not line2:
                break

            try:
                id_ = line1.strip()
                pred_json = json.loads(line2.strip())

                if pred_json['resultingHeadline'] is None:
                    continue

                pred_tokens = pred_json['resultingHeadline']
                
                auxil_path = str(Path(okform_dir) /
                                 Path(id_).with_suffix('.auxil'))
                paf_path = str(Path(okform_dir) /
                               Path(id_).with_suffix('.paf'))
                
                title_sents, _ = separate_title_from_body(auxil_path, paf_path)
                
                true_tokens = [item['token']
                               for item in title_sents[0]['features']]
                
                if is_consistent_prediction(pred_tokens, true_tokens):
                    stat = eval_stat(pred_tokens, true_tokens,
                                     accepted_labels)
                    if print_errors:
                        print_label_error(true_tokens,
                                          # we don't have features here
                                          features=None,
                                          instance_id=id_,
                                          excluded_indices=set([0]),
                                          correct_labels=map(get_label,
                                                             true_tokens),
                                          predicted_labels=map(get_label,
                                                               pred_tokens),
                                          target_true_label='IC',
                                          target_pred_label='AL',
                                          print_features=False)
                    ret_stat += stat
                    n_errorless += 1
                else:
                    logger.debug(
                        'Predicted and true tokens inconsistent:\n{}\n{}\n'.format(
                            pred_tokens, true_tokens)
                    )
            except Exception:
                logger.error(traceback.format_exc())
                continue
            finally:
                n_finished += 1

    return ret_stat
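
A hypothetical post-processing sketch that turns the returned (#label, 3) match/model/ref counts into per-label precision, recall and F1. A list is passed instead of the default set so the row order is predictable (assuming `eval_stat` keeps the rows in the order of `accepted_labels`); the file paths are placeholders:

labels = ['AL', 'IC']
stat = eval_rule_based('predictions.txt', '/path/to/okform',
                       accepted_labels=labels)
for label, (n_match, n_model, n_ref) in zip(labels, stat):
    precision = n_match / n_model if n_model else 0.0
    recall = n_match / n_ref if n_ref else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    print('{}: P={:.3f} R={:.3f} F1={:.3f}'.format(
        label, precision, recall, f1))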