Example #1
    def find_pair(
        self, left_df: pd.DataFrame, right_df: pd.DataFrame
    ) -> typing.Tuple[pd.DataFrame, typing.List[tuple]]:
        class left(rltk.AutoGeneratedRecord):
            pass

        class right(rltk.AutoGeneratedRecord):
            pass

        left_df['id'] = left_df.index.astype(str)
        right_df['id'] = right_df.index.astype(str)
        if 'Unnamed: 0' in right_df.columns:
            right_df = right_df.drop(columns=['Unnamed: 0'])
        ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
        ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

        bg = rltk.HashBlockGenerator()
        block = bg.generate(
            bg.block(ds1, property_=self.join_target_column_names[0]),
            bg.block(ds2, property_=self.join_target_column_names[1]))

        pairs = rltk.get_record_pairs(ds1, ds2, block=block)

        pairs_list = []
        pairs_column = [[] for _ in range(right_df.shape[0])]
        for r1, r2 in pairs:
            pairs_column[int(r2.id)].append(int(r1.id))
            pairs_list.append((r1.id, r2.id))

        right_df["joining_pairs"] = pairs_column
        return right_df, pairs_list
Example #2
def worker():
    tokenizer = rltk.CrfTokenizer()

    # load Datasets
    ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                           record_class=IMDBRecord,
                           adapter=rltk.MemoryKeyValueAdapter())
    ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                          record_class=AFIRecord,
                          adapter=rltk.MemoryKeyValueAdapter())
    valid_match = []
    for r_imdb in ds_imdb:
        # test this record with AFI records
        optimum = (None, MY_TRESH)
        for r_afi in ds_afi:
            result, confidence = rule_based_method(r_imdb, r_afi)
            if result and confidence > optimum[1]:
                optimum = (r_afi, confidence)

        if optimum[0] is not None:
            r_afi, confidence = optimum
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': r_afi.raw_object['url']
            })
        else:
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': None
            })

    with open(result_file, 'w') as fout:
        fout.write(json.dumps(valid_match, indent=4))
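worker() calls a rule_based_method(r_imdb, r_afi) that is not shown in this snippet and expects a (matched, confidence) pair back; a minimal sketch under that assumption (the attribute names follow Example #12; the weights and threshold are illustrative) might be:

def rule_based_method(r_imdb, r_afi, threshold=0.8):
    # illustrative scorer: weighted mix of name similarity and an exact-year check
    name_score = rltk.jaro_winkler_similarity(r_imdb.name_string, r_afi.name_string)
    year_score = 1.0 if r_imdb.year and r_imdb.year == r_afi.year else 0.0
    confidence = 0.7 * name_score + 0.3 * year_score
    return confidence >= threshold, confidence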
Example #3
def get_datasets_from_csvs(csv_path_1, record_1, csv_path_2, record_2):
    dataset_1 = rltk.Dataset(reader=rltk.CSVReader(csv_path_1),
                             record_class=record_1,
                             adapter=rltk.MemoryAdapter())
    dataset_2 = rltk.Dataset(reader=rltk.CSVReader(csv_path_2),
                             record_class=record_2,
                             adapter=rltk.MemoryAdapter())

    return dataset_1, dataset_2
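A hypothetical call site for this helper (the CSV paths and record classes below are illustrative, not taken from the source):

class LeftRecord(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']


class RightRecord(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']


# assumes left.csv and right.csv exist and each has an 'id' column
ds_left, ds_right = get_datasets_from_csvs('left.csv', LeftRecord,
                                            'right.csv', RightRecord)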
Example #4
def featurize(mode, output_filename=None):
    """
    Catch-all method to featurize either the train or the test dataset and save it to CSV.

    Params:
        mode: (str) 'train' or 'test'
        output_filename: (str) Optional. Name of the CSV file to save the data to.
    """
    MODE = mode
    if not os.path.exists('train/') or not os.path.exists('test/'):
        train_test_split()
        
    if not os.path.exists('block_files/'):
        os.mkdir('block_files/')

    BLOCK_FILE = 'block_files/'+MODE+'.jl'
    CORPUS_FREQ_FILE = MODE+'/corpus_freq.json'

    ds_amzn = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/Amazon.csv', encoding='latin-1')),
                    record_class=AmazonRecord, adapter=rltk.MemoryAdapter())

    ds_goog = rltk.Dataset(reader=rltk.CSVReader(open(MODE + '/GoogleProducts.csv', encoding='latin-1')),
                    record_class=GoogleRecord, adapter=rltk.MemoryAdapter())

    try:
        block_handler = open(BLOCK_FILE,'r')
        print("Block file exists. Reading from disk...")
    except FileNotFoundError:
        block_handler = rltk.InvertedIndexBlockGenerator(
            ds_amzn, ds_goog, writer=rltk.BlockFileWriter(BLOCK_FILE), tokenizer=tokenizer).generate()

    features = ['id1', 'id2', 'price_difference',
       'desc_jaccard', 'desc_tf_idf', 'desc_trigram',
       'manufacturer_jaccard', 'manufacturer_jaro_winkler',
       'manufacturer_levenshtien', 'name_jaccard', 'name_jaro_winkler',
       'name_trigram','label']

    pairs = rltk.get_record_pairs(ds_amzn, ds_goog, rltk.BlockFileReader(block_handler))
    freq = get_document_frequency(CORPUS_FREQ_FILE, ds_amzn, ds_goog)

    if MODE == "train":
        print("Featurizing train")
        if not output_filename:
            output_filename = 'train/features_train.csv'
        featurize_all_records(pairs, features, output_filename, freq, TRAIN_DOC_SIZE)
    elif MODE == "test":
        print("Featurizing test")
        if not output_filename:
            output_filename = 'test/features_test.csv'
        featurize_all_records(pairs, features, output_filename, freq, TEST_DOC_SIZE)
Example #5
def entity_links_stage_4():
    # load Datasets
    ds_issue_location = rltk.Dataset(reader=rltk.JsonLinesReader('ISSUE_LOCATIONS_DICT.jl'), record_class=LocationRecord, adapter=rltk.MemoryAdapter())
    ds_wikia_location = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_LOCATIONS_DICT.jl'), record_class=LocationRecord, adapter=rltk.MemoryAdapter())
    # print some entries
    print(ds_issue_location.generate_dataframe().head(5))
    print(ds_wikia_location.generate_dataframe().head(5))
    tot_counter = 0
    for item in ds_issue_location:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_location, False)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf*100, res_id))
            SIM_LOCATIONS__ISSUE_TO_WIKIA[item.id] = (res_id, res_conf)
    with open('SIM_LOCATIONS__ISSUE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_LOCATIONS__ISSUE_TO_WIKIA: ' + str(len(SIM_LOCATIONS__ISSUE_TO_WIKIA)))
        json.dump(SIM_LOCATIONS__ISSUE_TO_WIKIA, outfile, indent=2)
Example #6
def entity_links_stage_1():
    # load Datasets
    ds_movie_char = rltk.Dataset(reader=rltk.JsonLinesReader('MOVIE_CHARS_DICT.jl'), record_class=MovieCharRecord, adapter=rltk.MemoryAdapter())
    ds_wikia_char = rltk.Dataset(reader=rltk.JsonLinesReader('WIKIA_CHARS_DICT.jl'), record_class=WikiaCharRecord, adapter=rltk.MemoryAdapter())
    # print some entries
    print(ds_movie_char.generate_dataframe().head(5))
    print(ds_wikia_char.generate_dataframe().head(5))
    tot_counter = 0
    for item in ds_movie_char:
        tot_counter += 1
        res_id, res_conf = match_record_to_ds(item, ds_wikia_char)
        if res_id is not None:
            print('[%03d]: [%s] ---%03.02f%%--- [%s]' % (tot_counter, item.id, res_conf*100, res_id))
            SIM_CHARS__MOVIE_TO_WIKIA[item.id] = (res_id, res_conf)
    with open('SIM_CHARS__MOVIE_TO_WIKIA.json', 'w') as outfile:
        print('SIM_CHARS__MOVIE_TO_WIKIA: ' + str(len(SIM_CHARS__MOVIE_TO_WIKIA)))
        json.dump(SIM_CHARS__MOVIE_TO_WIKIA, outfile, indent=2)
Example #7
    def join(self, left_df: pd.DataFrame, right_df: pd.DataFrame,
             left_columns: typing.List[typing.List[int]],
             right_columns: typing.List[typing.List[int]], left_metadata: dict,
             right_metadata: dict) -> JoinResult:
        class left(rltk.AutoGeneratedRecord):
            pass

        class right(rltk.AutoGeneratedRecord):
            pass

        left_df['id'] = left_df.index.astype(str)
        right_df['id'] = right_df.index.astype(str)
        if 'Unnamed: 0' in right_df.columns:
            right_df = right_df.drop(columns=['Unnamed: 0'])
        ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=left)
        ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=right)

        bg = rltk.HashBlockGenerator()
        block = bg.generate(
            bg.block(ds1, property_=self.join_target_column_names[0]),
            bg.block(ds2, property_=self.join_target_column_names[1]))
        left_df = left_df.set_index('id')
        right_df = right_df.set_index('id')

        pairs = rltk.get_record_pairs(ds1, ds2, block=block)

        df_joined = pd.DataFrame()

        column_names_to_join = None
        for r1, r2 in pairs:
            left_res = left_df.loc[r1.id]
            right_res = right_df.loc[r2.id]
            if column_names_to_join is None:
                column_names_to_join = right_res.index.difference(
                    left_res.index)
                matched_rows = right_res.index.intersection(left_res.index)
                columns_new = left_res.index.tolist()
                columns_new.extend(column_names_to_join.tolist())
            new = pd.concat([left_res, right_res[column_names_to_join]])
            df_joined = df_joined.append(new, ignore_index=True)
        # ensure the original (left) dataframe's columns come first
        df_joined = df_joined[columns_new]

        return JoinResult(df_joined, matched_rows)
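Examples #1 and #7 both rely on the same hash-blocking pattern; the following self-contained sketch of that pattern uses toy DataFrames (column names and values are illustrative, not from the source):

import pandas as pd
import rltk


class LeftRec(rltk.AutoGeneratedRecord):
    pass


class RightRec(rltk.AutoGeneratedRecord):
    pass


left_df = pd.DataFrame({'id': ['0', '1'], 'city': ['LA', 'NY'], 'population': [4, 8]})
right_df = pd.DataFrame({'id': ['0', '1'], 'city': ['NY', 'LA'], 'area': [1300, 780]})

ds1 = rltk.Dataset(rltk.DataFrameReader(left_df), record_class=LeftRec)
ds2 = rltk.Dataset(rltk.DataFrameReader(right_df), record_class=RightRec)

# block on the shared 'city' column so only rows with equal keys are paired
bg = rltk.HashBlockGenerator()
block = bg.generate(bg.block(ds1, property_='city'),
                    bg.block(ds2, property_='city'))

for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
    print(r1.id, r1.city, '<->', r2.id, r2.city)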
Example #8
    @rltk.cached_property
    def brand(self):
        return set(self.raw_object['brand'])

    @rltk.cached_property
    def ingredients(self):
        return set(self.raw_object['ingredients_ids'])


product_file = './output/sephora_skincare_product_ingredient_list.jl'
with open(product_file) as json_products:
    products = [json.loads(line) for line in json_products]

ds_products = rltk.Dataset(reader=rltk.JsonLinesReader(product_file),
                           record_class=Product,
                           adapter=rltk.MemoryKeyValueAdapter())
df_products = ds_products.generate_dataframe()


def name_token_similarity(prod1, prod2):
    '''set'''
    set1 = prod1.name_tokens
    set2 = prod2.name_tokens
    return rltk.dice_similarity(set1, set2)


def name_string_similarity(prod1, prod2):
    s1 = prod1.name_string
    s2 = prod2.name_string
    return rltk.jaro_winkler_similarity(s1, s2)
Example #9
def create_dataset(input_file: str, rcrd_class: rltk.Record) -> rltk.Dataset:
    ''' Create rltk dataset from a given jl file '''
    assert Path(input_file).suffix == ".jl"
    return rltk.Dataset(reader=rltk.JsonLinesReader(input_file),
                        record_class=rcrd_class,
                        adapter=rltk.MemoryKeyValueAdapter())
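A hedged usage sketch (the movies.jl path and the MovieRecord class are illustrative assumptions):

class MovieRecord(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']


# assumes a JSON Lines file movies.jl with an 'id' field per line
ds_movies = create_dataset('movies.jl', MovieRecord)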
Example #10
def process():

    df_entity = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        source = os.path.basename(infile).split('.')[0]
        df_entity = df_entity.append(pd.read_hdf(infile))
    df_entity = df_entity.reset_index(drop=True)
    logger.info('Total number of entities: %d', len(df_entity))
    df_entity['type'] = df_entity['type'].apply(
        lambda x: x[0])  # only pick the first type (compatible with old pipeline)
    df_entity_ori = df_entity.copy()

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu])  # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]] # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(ds,
                            function_=lambda r: list(r.selected_target)
                            if r.selected_target else ['None'])
    bg_fb = rltk.TokenBlocker()
    blocks_fb = bg_fb.block(ds,
                            function_=lambda r: r.selected_fbid
                            if r.selected_fbid else ['None'])

    ### clustering
    logger.info('clustering entity')
    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue

        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            for id_ in r.selected_target:
                c.kb_id.add(id_)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
            c.add(r)
        all_clusters.append(c)

    # fb only clusters
    fb_only_clusters = {}
    for bid, data in blocks_fb.key_set_adapter:
        if bid == 'None':
            continue

        fb_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            fb_only_clusters[bid].add(r_id)
        if len(fb_only_clusters[bid]) == 0:
            del fb_only_clusters[bid]

    for bid, cluster in fb_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if r.fbid:
                for id_ in r.selected_fbid:
                    c.fb_id.add(id_)
            if r.wikidata:
                for id_ in r.selected_wikidata:
                    c.wd_id.add(id_)
        all_clusters.append(c)

    # validation
    for idx, c in enumerate(all_clusters):
        if len(c.kb_id) > 1:
            logger.error('multiple kb_ids in cluster: %s', c.kb_id)
            break

        kb_ids = set()
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    kb_ids.add(id_)
        if len(kb_ids) > 1:
            logger.error('multiple kb_ids in cluster: %s (cluster kb_id: %s)',
                         kb_ids, c.kb_id)
            break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc

            cc = types[type_]
            cc.add(r_id)
            if r.selected_target:
                for id_ in r.selected_target:
                    cc.kb_id.add(id_)
            if r.selected_fbid:
                for id_ in r.selected_fbid:
                    cc.fb_id.add(id_)
            if r.selected_wikidata:
                for id_ in r.selected_wikidata:
                    cc.wd_id.add(id_)
        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton
    final_clusters = deepcopy(all_clusters_splitted)
    MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])

    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None, 0]  # first item: cluster, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim >= MIN_SIM:
                    if sim > local_best[1]:
                        local_best = [c, sim]

            c = local_best[0]
            if c is not None:
                c.add(r, contribute=False)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'], contribute=False)
        final_clusters.append(c)
    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s',
                         e)

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = tuple(set([c.full_id for c in entity_to_cluster[e]]))
        df_entity_cluster.at[idx, 'cluster'] = clusters

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:
            # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    df_complete_entity_clusters.reset_index(drop=True)

    logger.info('writing to disk')
    output_file = os.path.join(config['temp_dir'], config['run_name'],
                               'entity_cluster.h5')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(output_file,
                                           'entity',
                                           mode='w',
                                           format='fixed')
        df_complete_entity_clusters.to_csv(output_file + '.csv')
Example #11
    @property
    def address(self):
        return self.raw_object['Address']

    @property
    def phone(self):
        return self.raw_object['Phone']

    @property
    def cuisine(self):
        return self.raw_object['Cuisine']


ds1 = rltk.Dataset(reader=rltk.DataFrameReader(ds1),
                   record_class=Record1,
                   adapter=rltk.MemoryKeyValueAdapter())
ds2 = rltk.Dataset(reader=rltk.DataFrameReader(ds2),
                   record_class=Record2,
                   adapter=rltk.MemoryKeyValueAdapter())
'''bg = rltk.HashBlockGenerator()
blocks = bg.generate(bg.block(ds1, property_='cuisine'), bg.block(ds2, property_='cuisine'))
pairs = rltk.get_record_pairs(ds1, ds2, block=blocks)'''

pairs = rltk.get_record_pairs(ds1, ds2)

f = open('similarities.txt', 'w+')

for r1, r2 in pairs:

    a_d = rltk.levenshtein_similarity(r1.address, r2.address)
Example #12
    def genre_set(self):
        return set(self.genre_string.split(','))

    @rltk.cached_property
    def year(self):
        match = re.search(r"(\d{4})", self.date_string)
        if match:
            return str(match.group(0))
        else:
            return ''


imdb_file = 'imdb.jl'
afi_file = 'afi.jl'

ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                       record_class=IMDBRecord,
                       adapter=rltk.MemoryKeyValueAdapter())
ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                      record_class=AFIRecord,
                      adapter=rltk.MemoryKeyValueAdapter())


def name_similarity(r_imdb, r_afi):
    s1 = r_imdb.name_string
    s2 = r_afi.name_string
    return rltk.jaro_winkler_similarity(s1, s2)


def genre_similarity(r_imdb, r_afi):
    s1 = r_imdb.genre_set
    s2 = r_afi.genre_set
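The snippet cuts off inside genre_similarity; one plausible set-based completion, assuming the author intended a Jaccard-style overlap of the genre sets, would be:

def genre_similarity_sketch(r_imdb, r_afi):
    # illustrative: Jaccard overlap between the two genre sets
    return rltk.jaccard_index_similarity(r_imdb.genre_set, r_afi.genre_set)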
Example #13
def process():

    df_entity = pd.DataFrame()
    df_event = pd.DataFrame()
    df_event_role = pd.DataFrame()
    df_relation = pd.DataFrame()
    df_relation_role = pd.DataFrame()

    logger.info('loading entity dataframes')
    for infile in glob.glob(
            os.path.join(config['temp_dir'], config['run_name'],
                         '*/*.entity.h5')):
        source = os.path.basename(infile).split('.')[0]
        # entity
        df_entity = df_entity.append(pd.read_hdf(infile))
        # event
        event_file = infile[:-len('entity.h5')] + 'event.h5'
        df_event = df_event.append(pd.read_hdf(event_file))
        event_role_file = infile[:-len('entity.h5')] + 'event_role.h5'
        df_event_role = df_event_role.append(pd.read_hdf(event_role_file))
        # relation
        relation_file = infile[:-len('entity.h5')] + 'relation.h5'
        df_relation = df_relation.append(pd.read_hdf(relation_file))
        relation_role_file = infile[:-len('entity.h5')] + 'relation_role.h5'
        df_relation_role = df_relation_role.append(
            pd.read_hdf(relation_role_file))
    logger.info('Read in {} entities, {} events, {} relations'.format(
        len(df_entity), len(df_event), len(df_relation)))
    df_entity = df_entity.drop_duplicates(
        subset=['e'],
        keep='last')  # CMU data has cross-document entities; only keep one
    df_entity = df_entity.reset_index(drop=True)
    df_entity['type'] = df_entity['type'].apply(
        lambda x: x[0])  # only pick the first type (compatible with old pipeline)
    df_entity_ori = df_entity.copy()
    df_event = df_event.drop_duplicates(subset=['e'],
                                        keep='last').reset_index(drop=True)
    df_event_role = df_event_role.drop_duplicates().reset_index(drop=True)
    df_relation = df_relation.drop_duplicates().reset_index(drop=True)
    df_relation_role = df_relation_role.drop_duplicates().reset_index(
        drop=True)
    logger.info(
        'After deduplication: {} entities, {} events, {} relations'.format(
            len(df_entity), len(df_event), len(df_relation)))

    ### filtering
    logger.info('filtering out some entity types')
    all_types = set(df_entity['type'])
    # all_types = set([t for tu in df_entity['type'] for t in tu])  # multi-type support
    selected_types = filter(
        lambda x: x.startswith(
            ('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')),
        all_types)
    df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
    # df_entity = df_entity.loc[[any([t in selected_types for t in tu]) for tu in df_entity['type']]] # multi-type support
    df_entity = df_entity[df_entity['name'].notnull()]
    df_entity = df_entity.where(pd.notnull(df_entity), None)
    df_entity_left = df_entity_ori[~df_entity_ori['e'].isin(df_entity['e'])]

    ### generate rltk components
    logger.info('generating rltk components')
    ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity),
                      record_class=GaiaRecord)
    # for r in ds:
    #     print(r.concatenated_labels)
    #     print(r.name, r.target, r.wikidata, r.selected_target_index, r.selected_wikidata_index)
    bg_kb = rltk.TokenBlocker()
    blocks_kb = bg_kb.block(ds,
                            function_=lambda r: [r.selected_target]
                            if r.selected_target else ['None'])
    bg_wd = rltk.TokenBlocker()
    blocks_wd = bg_wd.block(ds,
                            function_=lambda r: [r.selected_wikidata]
                            if r.selected_wikidata else ['None'])

    ### clustering
    logger.info('clustering entity')
    # build cluster based on type
    all_clusters = []
    for bid, data in blocks_kb.key_set_adapter:
        if bid == 'None':
            continue

        c = Cluster(ds)
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.target and not c.kb_id:
                c.kb_id = r.selected_target
                c.kb_labels = set(r.selected_target_labels)
            if r.wikidata:
                if r.selected_wikidata not in c.wd_candidate:
                    c.wd_candidate[r.selected_wikidata] = set(
                        r.selected_wikidata_labels)
            c.add(r)
        c.elect_wd_id()
        all_clusters.append(c)

    # find all wd only blocks
    wd_only_clusters = {}
    for bid, data in blocks_wd.key_set_adapter:
        if bid == 'None':
            continue

        wd_only_clusters[bid] = set()
        for _, r_id in data:
            r = ds.get_record(r_id)
            if r.selected_target:
                continue
            wd_only_clusters[bid].add(r_id)
        if len(wd_only_clusters[bid]) == 0:
            del wd_only_clusters[bid]

    # if wd block overlaps with kb clusters
    for c in all_clusters:
        if c.wd_id and c.wd_id in wd_only_clusters:
            for r in wd_only_clusters[c.wd_id]:
                c.add(r)
            del wd_only_clusters[c.wd_id]

    # construct clusters based on blocks
    for bid, cluster in wd_only_clusters.items():
        c = Cluster(ds)
        for r_id in cluster:
            c.add(r_id)
            r = ds.get_record(r_id)
            if not c.wd_id:
                c.wd_id = r.selected_wikidata
                c.wd_labels = set(r.selected_wikidata_labels)
        all_clusters.append(c)

    # validation
    # for idx, c in enumerate(all_clusters):
    #     if len(c.kb_id) > 1:
    #         logger.error('mulitple kb_ids in cluster: %s', c.kb_id)
    #         break
    #
    #     kb_ids = set()
    #     for r_id in c.all_records:
    #         r = ds.get_record(r_id)
    #         if r.selected_target:
    #             for id_ in r.selected_target:
    #                 kb_ids.add(id_)
    #     if len(kb_ids) > 1:
    #         logger.error('mulitple kb_ids in cluster: %s', kb_ids, c.kb_id)
    #         break

    # split based on types
    all_clusters_splitted = []
    for c in all_clusters:
        types = {}
        for r_id in c.all_records:
            r = ds.get_record(r_id)
            type_ = normalize_type(r.type)
            if type_ not in types:
                cc = Cluster(ds)
                cc.type = type_
                types[type_] = cc

            cc = types[type_]
            cc.add(r_id)
            cc.kb_id = c.kb_id
            cc.kb_labels = c.kb_labels
            cc.wd_id = c.wd_id
            cc.wd_labels = c.wd_labels

        for cc in types.values():
            all_clusters_splitted.append(cc)

    # merge singleton
    final_clusters = deepcopy(all_clusters_splitted)
    # MIN_SIM = 0.4
    clustered_entity_ids = set(
        [r for c in all_clusters for r in c.all_records])

    for _, e in df_entity['e'].items():
        if e not in clustered_entity_ids:
            r = ds.get_record(e)
            r_type = normalize_type(r.type)
            local_best = [None, 0]  # first item: cluster, second item: score
            for c in final_clusters:
                sim = c.similarity(r)
                if r_type != c.type:
                    continue
                if sim > local_best[1]:
                    local_best = [c, sim]

            c = local_best[0]
            if c is not None:
                c.add(r)
            else:
                # still singleton, construct singleton cluster
                c = Cluster(ds)
                c.type = r_type
                c.add(r)
                c.name_labels = set(r.name)
                final_clusters.append(c)

    # filtered-out entities
    # create cluster with fake record
    for _, e in df_entity_left.iterrows():
        c = Cluster(None)
        c.type = normalize_type(e['type'])
        c.add(e['e'])
        final_clusters.append(c)
    logger.info('Total number of clusters: %d', len(final_clusters))

    # create entity to cluster mapping
    entity_to_cluster = defaultdict(list)
    for c in final_clusters:
        for r in c.all_records:
            entity_to_cluster[r].append(c)
    for e, c in entity_to_cluster.items():
        if len(c) > 1:
            logger.error('Entity in multiple clusters detected, entity id: %s',
                         e)
    entity_to_cluster = {e: c[0] for e, c in entity_to_cluster.items()}

    ### generate cluster properties
    logger.info('generating cluster properties')
    for c in final_clusters:
        c.generate()

    ### event and relation cluster
    # these clusters URIs will be {event/relation uri}-cluster
    # prototype URIs hence will be just {event/relation uri}

    ### event role
    event_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_event_role.iterrows():
        event_role_se_dict['prototype1'].append(v['event'])
        event_role_se_dict['prototype2'].append(
            entity_to_cluster[v['entity']].prototype)
        event_role_se_dict['role'].append(v['role'])
        event_role_se_dict['just'].append(v['just'])
    df_event_role_se = pd.DataFrame.from_dict(event_role_se_dict)

    ### relation role
    relation_role_se_dict = {
        'prototype1': [],
        'prototype2': [],
        'role': [],
        'just': []
    }
    for idx, v in df_relation_role.iterrows():
        relation_role_se_dict['prototype1'].append(v['relation'])
        if v['type'] == 'entity':
            relation_role_se_dict['prototype2'].append(
                entity_to_cluster[v['e']].prototype)
        elif v['type'] == 'event':
            relation_role_se_dict['prototype2'].append(v['e'])
        relation_role_se_dict['role'].append(v['role'])
        relation_role_se_dict['just'].append(v['just'])
    df_relation_role_se = pd.DataFrame.from_dict(relation_role_se_dict)

    ### export
    logger.info('exporting clusters')
    df_entity_cluster = df_entity_ori.copy()
    df_entity_cluster['cluster'] = None
    df_entity_cluster['synthetic'] = False
    df_entity_cluster['cluster_member_confidence'] = None

    logger.info('updating cluster info for each entity')
    for idx, e in df_entity_cluster['e'].items():
        clusters = [entity_to_cluster[e]]
        cluster_ids = tuple([c.full_id for c in clusters])
        confidences = tuple([c.member_confidence[e] for c in clusters])
        df_entity_cluster.at[idx, 'cluster'] = cluster_ids
        df_entity_cluster.at[idx, 'cluster_member_confidence'] = confidences

    logger.info('creating prototypes')
    proto_to_cluster_mapping = {}
    for c in final_clusters:
        proto_to_cluster_mapping[c.feature_entity_id] = c
    proto_dict = []
    for idx, row in df_entity_cluster.iterrows():
        eid = row['e']
        if eid not in proto_to_cluster_mapping:
            # not a prototype
            continue
        c = proto_to_cluster_mapping[eid]
        # p = df_entity_ori[df_entity_ori['e'] == c.feature_entity_id].iloc[0]
        row = row.to_dict()
        row['synthetic'] = True
        row['cluster'] = tuple([c.full_id])
        row['e'] = c.prototype
        proto_dict.append(row)
    df_prototypes = pd.DataFrame.from_dict(proto_dict)

    logger.info('appending dataframes')
    df_complete_entity_clusters = df_entity_cluster.append(df_prototypes)
    df_complete_entity_clusters.reset_index(drop=True)

    logger.info('writing to disk')
    entity_cluster_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'entity_cluster')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        df_complete_entity_clusters.to_hdf(entity_cluster_output_file + '.h5',
                                           'entity',
                                           mode='w',
                                           format='fixed')
        df_complete_entity_clusters.to_csv(entity_cluster_output_file +
                                           '.h5.csv')
    with open(entity_cluster_output_file + '.cluster.jl', 'w') as f:
        for c in final_clusters:
            f.write(json.dumps(c.debug()) + '\n')

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # event
        event_cluster_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'event_cluster.h5')
        df_event.to_hdf(event_cluster_output_file, 'event')
        event_role_output_file = os.path.join(config['temp_dir'],
                                              config['run_name'],
                                              'event_role.h5')
        df_event_role_se.to_hdf(event_role_output_file, 'event_role')
        df_event_role_se.to_csv(event_role_output_file + '.csv')
        # relation
        relation_cluster_output_file = os.path.join(config['temp_dir'],
                                                    config['run_name'],
                                                    'relation_cluster.h5')
        df_relation.to_hdf(relation_cluster_output_file, 'relation')
        relation_role_output_file = os.path.join(config['temp_dir'],
                                                 config['run_name'],
                                                 'relation_role.h5')
        df_relation_role_se.to_hdf(relation_role_output_file,
                                   'relation_role',
                                   mode='w',
                                   format='fixed')
        df_relation_role_se.to_csv(relation_role_output_file + '.csv')
Example #14
import pandas as pd
import rltk

print('from dataframe...')

df = pd.read_csv('ds1.csv', encoding='latin-1')
df['id'] = df['doc_id'].astype('str')


class DFRecord(rltk.AutoGeneratedRecord):
    pass


ds = rltk.Dataset(rltk.DataFrameReader(df), record_class=DFRecord)
for r in ds:
    print(r.id, r.doc_id, r.doc_value)

print('set id column...')


@rltk.set_id('col1', function_=lambda x: str(x), keep_original=True)
class DFRecord2(rltk.AutoGeneratedRecord):
    pass


df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
ds = rltk.Dataset(reader=rltk.DataFrameReader(df), record_class=DFRecord2)
for r in ds:
    print(r.id, r.col1, r.col2)
Example #15
    @property
    def id_and_value(self):
        print('--> compute id_and_value')
        return self.id + '-' + self.value


arr = [{
    'doc_id': '1',
    'doc_value': 'a'
}, {
    'doc_id': '2',
    'doc_value': 'b'
}, {
    'doc_id': '3',
    'doc_value': 'c'
}]
# adapter = rltk.RedisKeyValueAdapter(host='127.0.0.1', key_prefix='cached_')
adapter = rltk.HBaseKeyValueAdapter(host='127.0.0.1',
                                    key_prefix='test_',
                                    table='rltk_test1')
ds1 = rltk.Dataset(reader=rltk.ArrayReader(arr),
                   record_class=Record1,
                   adapter=adapter)
for r1 in ds1:
    print('------------')
    print('id:', r1.id)
    print('value:', r1.value)
    print('id_and_value:', r1.id_and_value)
    print('cache in dict:', r1.__dict__)
Example #16
    @rltk.cached_property
    def phone(self):
        phone = self.raw_object['Phone'].replace('/', '-').replace(
            ' ', '')  #.replace('and','or').split('or')
        #         print(phone.strip()[:15])
        return phone.strip()[:15]

    @rltk.cached_property
    def cuisine(self):
        cs = self.raw_object['Cuisine']
        return cs if cs else ''


ds_fod = rltk.Dataset(rltk.CSVReader(file_F),
                      record_class=DBFod,
                      adapter=rltk.MemoryKeyValueAdapter())
# dFod = [[k+1,dblp.id,dblp.cuisine,dblp.address] for k,dblp in enumerate(ds_fod)]
# print(dFod[506])
# for r_dblp in ds_fod:
#     print(r_dblp.name)

tokenizer = rltk.CrfTokenizer()
i = 0


def tokenize_id(t):
    tokens = tokenizer.tokenize(t)
    global i
    i += 1
    t = str(i)
Example #17
    def _init_rltk_dataset(df, record_class):
        rltk_dataset = rltk.Dataset(reader=DataFrameReader(df, True),
                                    record_class=record_class)
        return rltk_dataset
Example #18
        return '4' if self.id == '1' else None


class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def value(self):
        v = self.raw_object.get('values', list())
        return v[0] if len(v) > 0 else 'empty'


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.DBMAdapter('file_index'))

pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in pairs:
    print('-------------')
    print(r1.id, r1.value, '\t', r2.id, r2.value)
    if r1.parent_id:
        print('r1\'s parent', r1.parent_id, ds1.get_record(r1.parent_id).value)
    print('levenshtein_distance:',
          rltk.levenshtein_distance(r1.value, r2.value))
    print('levenshtein_similarity:',
          rltk.levenshtein_similarity(r1.value, r2.value))
Example #19
    return rltk.hybrid_jaccard_similarity(r_museum.name_tokens,
                                          r_ulan.name_tokens,
                                          threshold=0.67)


if __name__ == '__main__':
    ulan_ds_adapter = rltk.RedisKeyValueAdapter('127.0.0.1',
                                                key_prefix='ulan_ds_')
    bg = rltk.TokenBlockGenerator()
    ulan_block = rltk.Block(
        rltk.RedisKeySetAdapter('127.0.0.1', key_prefix='ulan_block_'))

    # pre computing for ulan data
    if rltk.cli.confirm('Regenerate ULAN data caches?', default=False):
        ds_ulan = rltk.Dataset(
            reader=rltk.JsonLinesReader('../../datasets/museum/ulan.json'),
            record_class=RecordULAN,
            adapter=ulan_ds_adapter)
        b_ulan = bg.block(ds_ulan,
                          function_=block_on_name_prefix,
                          block=ulan_block)

    # load ulan
    ds_ulan = rltk.Dataset(adapter=ulan_ds_adapter)
    b_ulan = ulan_block

    # compare against museums' data
    museums = list(
        map(lambda x: os.path.splitext(os.path.basename(x))[0],
            glob.glob('../../datasets/museum/*.json')))
    museums.remove('ulan')
    for museum in museums:
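The script above references a block_on_name_prefix function that is not included in the snippet; a purely illustrative sketch of such a blocking-key function for rltk.TokenBlockGenerator could be:

def block_on_name_prefix(r, prefix_len=3):
    # emit one blocking token per name token: its lowercase first characters
    return [t[:prefix_len].lower() for t in r.name_tokens]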
Example #20
    @rltk.cached_property
    def brand_cleaned(self):
        _ = self.name_tokens
        manufacturer = self.manufacturer
        return process_brand_alias(
            manufacturer if manufacturer != '' else self.brand)

    @rltk.cached_property
    def model_cleaned(self):
        m = self.model
        return BuyRecord._clean(m)


ds_abt = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Abt.csv', encoding='latin-1')),
                      record_class=AbtRecord,
                      adapter=rltk.MemoryKeyValueAdapter())

ds_buy = rltk.Dataset(reader=rltk.CSVReader(
    open('../../datasets/Abt-Buy/Buy.csv', encoding='latin-1')),
                      record_class=BuyRecord,
                      adapter=rltk.MemoryKeyValueAdapter())

# statistics
print_details = False
name_count = model_count = description_count = price_count = brand_count = 0
for r in ds_abt:
    name_count += 1
    print('------\nname:', r.name) if print_details else ''
    if len(r.description) > 0:
        description_count += 1
Example #21
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import numpy as np
import rltk

from featurize import featurize, get_document_frequency, featurize_record_pair, TRAIN_DOC_SIZE
from utils import impute_df, DATASET_DIR
from amazon_record import AmazonRecord
from google_record import GoogleRecord

ds_amzn = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'Amazon.csv', encoding='latin-1')),
                       record_class=AmazonRecord,
                       adapter=rltk.MemoryAdapter())

ds_goog = rltk.Dataset(reader=rltk.CSVReader(
    open(DATASET_DIR + 'GoogleProducts.csv', encoding='latin-1')),
                       record_class=GoogleRecord,
                       adapter=rltk.MemoryAdapter())


def generate_features(gt_train):
    """
    Generate features from stratified ground truth DataFrames

    Params:
        gt_train: (DataFrame) DataFrame containing stratified training data ids and labels
    """
Example #22
import rltk


raw_inputs = [
    {'name': 'a1', 'age': 10, 'id': 1},
    {'name': 'a2', 'age': 20, 'id': 2},
    {'name': 'a3', 'age': 30, 'id': 3},
    {'name': 'a3', 'age': 30, 'id': 4},
    {'name': 'a1', 'age': 10, 'id': 5},
]


class MyRecord(rltk.Record):

    @property
    def id(self):
        return str(self.raw_object['id'])

    @property
    def name(self):
        return self.raw_object['name']

    @property
    def age(self):
        return self.raw_object['age']


ds = rltk.Dataset(reader=rltk.ArrayReader(raw_inputs), record_class=MyRecord)
for r, r_ in rltk.get_record_pairs(ds):
    print('comparing', r.id, r_.id, r.name == r_.name and r.age == r_.age)
Example #23
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        return self.raw_object['laptop_brand']


@rltk.remove_raw_object
class EvaluationRecord2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['id']

    @rltk.cached_property
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def laptop(self):
        return self.raw_object['laptop']


dataset_1_file_name = 'data_1.csv'
dataset_2_file_name = 'data_2.csv'

ds1 = rltk.Dataset(reader=rltk.CSVReader(dataset_1_file_name),
                   record_class=EvaluationRecord)
ds2 = rltk.Dataset(reader=rltk.CSVReader(dataset_2_file_name),
                   record_class=EvaluationRecord2)
Example #24
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]

    @property
    def full_name(self):
        return self.first_name + ' ' + self.last_name


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1)
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)

ngram = rltk.NGramTokenizer()

bg = rltk.TokenBlockGenerator()
block1 = bg.block(ds1,
                  function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store',
                                                'b1',
                                                clean=True)))
block2 = bg.block(ds2,
                  function_=lambda r: ngram.basic(r.first_name, 3),
                  block=rltk.Block(
                      rltk.LevelDbKeySetAdapter('block_store',
Example #25
class Record2(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['ident']

    @rltk.cached_property
    def first_name(self):
        return self.raw_object['name'].split(' ')[0]

    @rltk.cached_property
    def last_name(self):
        return self.raw_object['name'].split(' ')[1]


ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
                   record_class=Record1,
                   adapter=rltk.MemoryAdapter())
ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                   record_class=Record2,
                   adapter=rltk.MemoryAdapter())

# for r in ds1:
#     print(r.id, r.first_name, r.last_name)
# for r in ds2:
#     print(r.id, r.first_name, r.last_name)

block_writer = rltk.BlockFileWriter('blocks.jl')
# block_writer = rltk.BlockArrayWriter()
block_writer.write('1', 'a')
block_writer.write('2', 'b')
block_writer.write('2', 'd')
Example #26
def main():
    with open("dblp_final_JSON.json", "r") as f:
        dblp_dict = json.load(f)

    professors = set()
    for key in dblp_dict:
        professors.add(key['person'])

    #print(professors)
    #print(len(professors))

    coauthor_dict = defaultdict(list)
    for key in dblp_dict:
        author = key['person']
        for items in key['papers']:
            co_authors = items['co_authors']
            if author in co_authors:
                co_authors.remove(author)
            if co_authors:
                coauthor_dict[author].extend(co_authors)

    list_of_coauthors = []
    for key in coauthor_dict:
        list_of_coauthors.extend(coauthor_dict[key])
    #print(len(list_of_coauthors))

    ### String / Data Matching for Entity linking using RLTK

    ### Remove duplicates in the coauthor_dict using String Matching
    ### Compare with professors and do entity linking / remove duplicates

    df1 = pd.DataFrame(list(professors), columns=['name'])
    #print(df1)
    df2 = pd.DataFrame(list_of_coauthors, columns=['name'])
    #print(len(df2))
    df1['first_name'] = df1.apply(lambda x: x['name'].split()[0], axis=1)
    df1['last_name'] = df1.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df1['id'] = (df1.index + 1).astype(str)

    #print(df1)
    df2['first_name'] = df2.apply(lambda x: x['name'].split()[0], axis=1)
    df2['last_name'] = df2.apply(lambda x: ' '.join(x['name'].split()[1:]),
                                 axis=1)
    df2['id'] = (df2.index + 1).astype(str)

    ds1 = rltk.Dataset(reader=rltk.DataFrameReader(df1),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())
    ds2 = rltk.Dataset(reader=rltk.DataFrameReader(df2),
                       record_class=Record2,
                       adapter=rltk.MemoryKeyValueAdapter())
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='fname'),
                        bg.block(ds2, property_='fname'))
    pairs = rltk.get_record_pairs(ds1, ds2, block=block)
    num_pairs = 0
    sim_pairs = []
    sim_dict = {}
    for r1, r2 in pairs:
        num_pairs += 1
        sim = rltk.jaro_winkler_similarity(r1.lname, r2.lname)
        if 0.9 < sim < 1:
            sim_pairs.append(
                (r1.fname + ' ' + r1.lname, r2.fname + ' ' + r2.lname))
            sim_dict[r1.fname + ' ' + r1.lname] = r2.fname + ' ' + r2.lname
            #print(r1.lname,r2.lname,sim)
    #print(sim_pairs)
    #print("Blocking using Cuisine - Number of pairs:",num_pairs)
    for key in coauthor_dict:
        lis = coauthor_dict[key]
        for ind in range(len(lis)):
            if lis[ind] in sim_dict:
                lis[ind] = sim_dict[lis[ind]]

    with open("co_authors.json", "w") as jf:
        json.dump(coauthor_dict, jf, indent=2)