예제 #1
0
def _get_mention_splits(doc, mention, seek, span):
  """Tokenize the text on each side of a mention and report where it ends.

  The mention is looked up in ``doc[seek:]`` and that position is converted
  into an offset within the sentence ``doc[span[0]:span[1]]``. Two strings
  are then tokenized with ``parse_for_tokens``: the sentence text before the
  mention followed by the mention, and the mention followed by the sentence
  text after it.

  Args:
    doc: Text to search; it is sliced and searched for ``mention``.
    mention: Text to locate in ``doc`` starting at ``seek``.
    seek: Index in ``doc`` from which the search starts.
    span: Indexable whose items 0 and 1 are the start and end offsets of
      the enclosing sentence in ``doc``.

  Returns:
    A tuple ``([before_tokens, after_tokens], next_seek)`` where
    ``next_seek`` is the index in ``doc`` just past the mention.

  NOTE(review): a failed lookup is not checked here; the offsets are derived
  from whatever ``_.index_of`` returns. Presumably callers guarantee that
  the mention occurs in ``doc[seek:]`` -- TODO confirm.
  """
  offset_from_seek = _.index_of(doc[seek:], mention)
  # Re-base the position from "relative to seek" to "relative to the
  # start of the sentence".
  start_in_sentence = seek - span[0] + offset_from_seek
  end_in_sentence = start_in_sentence + len(mention)
  sentence = doc[span[0]:span[1]]
  before_with_mention = parse_for_tokens(sentence[:start_in_sentence] + mention)
  mention_with_after = parse_for_tokens(mention + sentence[end_in_sentence:])
  next_seek = span[0] + end_in_sentence
  return [before_with_mention, mention_with_after], next_seek
예제 #2
0
def check_overlap(ranks_1, ranks_2):
    """Count how often two sets of rankings agree on the order of item pairs.

    The rankings in ``ranks_1`` and ``ranks_2`` are walked in lockstep. For
    each pair of rankings, every 2-item combination of the first ranking is
    one comparison. Positions in the second ranking are looked up only
    within its first ``len(first ranking)`` entries.

    A comparison counts as an agreement when the first item of the pair is
    found in that window of the second ranking and either

    * the second item is not found there (lookup gave ``-1``), or
    * both rankings place the two items in the same relative order.

    If the first item is not found in the window, the comparison is counted
    but is never an agreement.

    Args:
        ranks_1: Iterable of rankings (each indexable, with a length).
        ranks_2: Iterable of rankings paired position-wise with ``ranks_1``.

    Returns:
        A tuple ``(agreements, comparisons)``.
    """
    agreements = 0
    comparisons = 0
    for own_ranking, other_ranking in zip(ranks_1, ranks_2):
        for first_doc, second_doc in combinations(own_ranking, 2):
            comparisons += 1
            other_window = other_ranking[:len(own_ranking)]
            first_in_other = _.index_of(other_window, first_doc)
            second_in_other = _.index_of(other_window, second_doc)
            first_in_own = _.index_of(own_ranking, first_doc)
            second_in_own = _.index_of(own_ranking, second_doc)
            if first_in_other == -1:
                continue
            same_order = ((first_in_own < second_in_own)
                          == (first_in_other < second_in_other))
            if second_in_other == -1 or same_order:
                agreements += 1
    return agreements, comparisons
예제 #3
0
def adult(data_path='./data/adult.data'):
    """Load the Adult data file and integer-encode its categorical columns.

    The file is read as a headerless CSV whose columns are the names in
    ``feature_names`` followed by ``'target'``. Every column listed in
    ``cat_mapping`` is replaced by the index of its (whitespace-stripped)
    value in that column's option list. A value that is not in the list is
    encoded as ``len(options)``, i.e. one past the last valid index, so the
    "unknown" code is the same no matter which categories happen to occur
    in the data and can never collide with a real category.

    Args:
        data_path: Path of the CSV file to read. Defaults to
            ``'./data/adult.data'``.

    Returns:
        A tuple ``(df, feature_names, cat_mapping, target_names)``:

        * ``df`` -- the loaded frame with encoded categorical columns and a
          boolean ``target`` column that is True when the raw label
          contains ``'>'``.
        * ``feature_names`` -- the 14 feature column names, in file order.
        * ``cat_mapping`` -- categorical column name -> list of options;
          an option's position in the list is its integer code.
        * ``target_names`` -- ``['<=50K', '>50K']``.
    """
    feature_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'
    ]
    cat_mapping = {
        'workclass': [
            'Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov',
            'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'
        ],
        'education': [
            'Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
            'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
            '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'
        ],
        'marital-status': [
            'Married-civ-spouse', 'Divorced', 'Never-married', 'Separated',
            'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'
        ],
        'occupation': [
            'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
            'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
            'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing',
            'Transport-moving', 'Priv-house-serv', 'Protective-serv',
            'Armed-Forces'
        ],
        'relationship': [
            'Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative',
            'Unmarried'
        ],
        'race': [
            'White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other',
            'Black'
        ],
        'sex': ['Female', 'Male'],
        'native-country': [
            'United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada',
            'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan',
            'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras',
            'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico',
            'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos',
            'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala',
            'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador',
            'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'
        ]
    }
    df = pd.read_csv(data_path,
                     header=None,
                     names=feature_names + ['target'])

    def encode(raw_value, codes, unknown_code):
        # Raw fields carry surrounding whitespace, so strip before lookup.
        return codes.get(raw_value.strip(), unknown_code)

    for col_name, options in cat_mapping.items():
        # Option -> index table; setdefault keeps the first index if an
        # option were ever listed twice, matching a first-match list search.
        codes = {}
        for position, option in enumerate(options):
            codes.setdefault(option, position)
        # Unknown values get len(options). Deriving the code from the
        # largest index seen in the data made it data-dependent and
        # recomputed that maximum for every unknown row.
        df[col_name] = df[col_name].apply(encode, args=(codes, len(options)))

    target_names = ['<=50K', '>50K']
    df['target'] = df['target'].apply(lambda label: '>' in label)
    return df, feature_names, cat_mapping, target_names
예제 #4
0
def _get_splits(documents, mentions):
  """Collect the mention splits for a sequence of mentions across documents.

  Mentions are consumed in order with a single cursor shared across all
  documents. Within a document, each mention is searched for from the
  current ``seek`` position. The first mention that cannot be found ends
  processing of that document, and the same mention is then tried against
  the next document.

  A mention is searched for verbatim first. If that fails, a second form is
  tried in which every ``.`` and ``,`` is surrounded by single spaces, runs
  of spaces are collapsed, and ``'D . C .'`` is turned back into ``'D.C.'``.
  Whichever form matched is the text used to compute the mention's end
  index and passed on to ``_get_mention_splits``. Using the verbatim
  mention after a fallback match would make the downstream lookup fail and
  produce wrong offsets, because that text is not present in the document.

  Args:
    documents: Sequence of document texts.
    mentions: Sequence of mention texts, in the order they are expected to
      occur across ``documents``.

  Returns:
    A list with one entry per located mention, each being the splits
    returned by ``_get_mention_splits``.
  """
  all_splits = []
  doc_sentence_spans = [parse_for_sentence_spans(doc) for doc in documents]
  mention_idx = 0
  for doc, spans in zip(documents, doc_sentence_spans):
    seek = 0
    while mention_idx < len(mentions):
      mention = mentions[mention_idx]
      matched_text = mention
      mention_start_offset = _.index_of(doc[seek:], matched_text)
      if mention_start_offset == -1:
        # Retry with punctuation spaced out from the surrounding words.
        spaced_periods = ' . '.join(mention.split('.'))
        spaced_commas = ' , '.join(spaced_periods.split(','))
        matched_text = re.sub(' +', ' ', spaced_commas).replace('D . C .', 'D.C.')
        mention_start_offset = _.index_of(doc[seek:], matched_text)
        if mention_start_offset == -1:
          # Not in this document in either form; try the next document.
          break
      mention_start_idx = mention_start_offset + seek
      # Measure and split with the text that was actually found.
      mention_end_idx = mention_start_idx + len(matched_text)
      span = _create_span(spans, mention_start_idx, mention_end_idx)
      splits, seek = _get_mention_splits(doc, matched_text, seek, span)
      all_splits.append(splits)
      mention_idx += 1
  return all_splits
예제 #5
0
def test_index_of(case, value, from_index, expected):
    """Check that ``_.index_of(case, value, from_index)`` gives ``expected``."""
    actual = _.index_of(case, value, from_index)
    assert actual == expected
예제 #6
0
def _get_mention_sentence(doc, mention, seek, span):
    """Tokenize the sentence containing a mention and report where it ends.

    The mention is looked up in ``doc[seek:]`` to work out where it ends;
    the whole sentence ``doc[span[0]:span[1]]`` is tokenized with
    ``parse_for_tokens``.

    Args:
        doc: Text to search; it is sliced and searched for ``mention``.
        mention: Text to locate in ``doc`` starting at ``seek``.
        seek: Index in ``doc`` from which the search starts.
        span: Indexable whose items 0 and 1 are the start and end offsets
            of the enclosing sentence in ``doc``.

    Returns:
        A tuple ``(sentence_tokens, next_seek)`` where ``next_seek`` is the
        index in ``doc`` just past the mention.

    NOTE(review): a failed lookup is not checked here; ``next_seek`` is
    derived from whatever ``_.index_of`` returns. Presumably callers
    guarantee that the mention occurs in ``doc[seek:]`` -- TODO confirm.
    """
    offset_from_seek = _.index_of(doc[seek:], mention)
    # Position of the mention's end, measured from the sentence start.
    end_in_sentence = (seek - span[0] + offset_from_seek) + len(mention)
    sentence_tokens = parse_for_tokens(doc[span[0]:span[1]])
    next_seek = span[0] + end_in_sentence
    return sentence_tokens, next_seek