예제 #1
0
    def __init__(self,
                 rawdata_dir,
                 max_docs_per_item=50,
                 gold_sterr=0.5,
                 n_items=0):
        """Load a ranking dataset and build a noisy gold ranking per topic.

        Args:
            rawdata_dir: Data source handed to ``pd.read_csv``. The file is
                space-separated and is read with the column names
                ``topic_item``, ``na``, ``doc`` and ``gold``.
            max_docs_per_item: Forwarded unchanged to ``self.trunc_docs``.
            gold_sterr: Standard deviation of the zero-mean Gaussian noise
                added to every ``gold`` value.
            n_items: Minimum number of topics wanted. If the data holds
                fewer, extra topics are synthesised by copying a randomly
                chosen existing topic and permuting its ``gold`` values.

        Sets ``self.df``, ``self.topic_lookup``, ``self.doc_lookup`` and
        ``self.gold`` (per topic: ``doc`` values sorted by descending gold).
        """
        # ``error_bad_lines`` was removed from read_csv in pandas 2.0;
        # ``on_bad_lines="skip"`` (pandas >= 1.3) is the documented successor.
        self.df = pd.read_csv(rawdata_dir,
                              sep=" ",
                              on_bad_lines="skip",
                              names=["topic_item", "na", "doc", "gold"])
        # Keep only the rows whose ``na`` column equals 0.
        self.df = self.df[self.df.na == 0]
        self.df = self.trunc_docs(max_docs_per_item)
        topics = self.df.topic_item.unique()
        n_items = max(n_items, len(topics))
        extradfs = []
        for i in range(n_items - len(topics)):
            topic = np.random.choice(topics)
            tdf = self.df[self.df.topic_item == topic]
            # NOTE(review): the synthetic id is ``topic * (10 + i)``; for
            # numeric ids this could coincide with a real topic id - confirm.
            new_topic = topic * (10 + i)
            newdf = tdf.copy(deep=True)
            newdf.topic_item = new_topic
            newdf.gold = np.random.permutation(newdf.gold.values)
            extradfs.append(newdf)
        if extradfs:
            self.df = pd.concat([self.df] + extradfs)
        # Perturb every gold score with Gaussian noise of scale gold_sterr.
        self.df.gold = self.df.gold + np.random.normal(0, gold_sterr,
                                                       len(self.df.gold))
        self.topic_lookup = utils.make_categorical(self.df, "topic_item")
        self.doc_lookup = utils.make_categorical(self.df, "doc")

        def rank_docs(data):
            """Return the group's ``doc`` values ordered by descending gold."""
            return data.sort_values("gold", ascending=False).doc.values

        self.gold = self.df.groupby("topic_item").apply(rank_docs)
예제 #2
0
 def setup(self,
           annodf,
           golddf,
           c_anno_uid=None,
           c_anno_item=None,
           c_anno_label=None,
           c_gold_item=None,
           c_gold_label=None):
     """Normalise annotation and gold frames, then build the Stan data.

     Args:
         annodf: Frame of annotations holding uid, item and label columns.
         golddf: Frame of gold labels holding item and label columns.
         c_anno_uid, c_anno_item, c_anno_label: Names of the uid / item /
             label columns in ``annodf`` when they differ from
             ``self.uid_colname`` / ``self.item_colname`` /
             ``self.label_colname``.
         c_gold_item, c_gold_label: Same, for ``golddf``.

     Sets ``self.annodf`` (rows with missing values dropped, columns renamed
     to the canonical names) and ``self.golddict`` (item -> gold label), then
     calls ``self.produce_stan_data()``.
     """
     uid_col = self.uid_colname
     item_col = self.item_colname
     label_col = self.label_colname

     # Explicit caller-name -> canonical-name pairs; this replaces sniffing
     # parameter names out of locals(), which silently depends on naming.
     overrides = (
         (c_anno_uid, uid_col),
         (c_anno_item, item_col),
         (c_anno_label, label_col),
         (c_gold_item, item_col),
         (c_gold_label, label_col),
     )
     colrename = {
         given: canonical
         for given, canonical in overrides if given is not None
     }

     self.annodf = annodf[[
         c_anno_uid or uid_col, c_anno_item or item_col,
         c_anno_label or label_col
     ]]
     self.annodf = self.annodf.dropna().copy().rename(columns=colrename)[[
         uid_col, item_col, label_col
     ]]
     # Use the configured column names rather than the literals "uid" and
     # "item", so non-default column names work.
     # The uid lookup is not needed here; the call is kept because later
     # steps presumably rely on its effect on the frame - TODO confirm.
     utils.make_categorical(self.annodf, uid_col)
     itemdict = utils.make_categorical(self.annodf, item_col)

     golddf = golddf[[c_gold_item or item_col, c_gold_label or label_col]]
     golddf = golddf.rename(columns=colrename)[[item_col, label_col]]
     golddf = utils.translate_categorical(golddf, item_col, itemdict)
     self.golddict = golddf.set_index(item_col).to_dict()[label_col]
     self.produce_stan_data()
예제 #3
0
 def setup(self):
     """Flatten the raw crowd annotations and register two baselines.

     For every row of ``self.rawdf`` the matching ``Participants`` entries
     are looked up by ``docid`` in ``self.golddf`` and ``self.aggdf``.
     Documents whose gold entry has no ``"MedicalStudent"`` key are skipped.
     One output row is produced per (user, document) pair. The resulting
     frame is passed to the parent ``setup`` as both annotation and gold
     data, and the ``MajorityVote`` / ``HMMCrowd`` aggregates are registered
     as the "Tokenwise MV" and "Crowd-HMM" baselines.
     """
     uid_col = []
     item_col = []
     label_col = []
     gold_col = []
     hmm_col = []
     mv_col = []

     for _, record in self.rawdf.iterrows():
         doc_id = record["docid"]
         annotations = record["Participants"]

         gold_rows = self.golddf[self.golddf["docid"] == doc_id]
         expert = gold_rows["Participants"].values[0].get("MedicalStudent")
         if expert is None:
             # No reference annotation for this document.
             continue

         agg_rows = self.aggdf[self.aggdf["docid"] == doc_id]
         aggregated = agg_rows["Participants"].values[0]

         for worker, worker_label in annotations.items():
             uid_col.append(worker)
             item_col.append(doc_id)
             label_col.append(label2tvr(worker_label, default=[]))
             gold_col.append(label2tvr(expert))
             hmm_col.append(aggregated["HMMCrowd"])
             mv_col.append(aggregated["MajorityVote"])

     df = pd.DataFrame({
         "uid": uid_col,
         "itemID": item_col,
         "label": label_col,
         "gold": gold_col,
         "HMMCrowd": hmm_col,
         "MajorityVote": mv_col,
     }).sort_values("itemID")

     # The returned lookups are not used here; the calls are kept as in the
     # original and run before the copy handed to the parent is taken.
     utils.make_categorical(df, "uid")
     utils.make_categorical(df, "itemID")

     anno_df = df.copy()
     super().setup(anno_df, anno_df, c_gold_label="gold")

     first_per_item = df.groupby("itemID").first()

     def baseline_labels(column):
         """Map each item to the converted first non-missing *column* value."""
         per_item = dict(first_per_item[column].dropna())
         return {item: label2tvr(raw) for item, raw in per_item.items()}

     mv_labels = baseline_labels("MajorityVote")
     hmm_labels = baseline_labels("HMMCrowd")
     self.register_baseline("Tokenwise MV", mv_labels)
     self.register_baseline("Crowd-HMM", hmm_labels)
예제 #4
0
 def setup(self,
           annodf,
           golddf=None,
           c_anno_uid=None,
           c_anno_item=None,
           c_anno_label=None,
           c_gold_item=None,
           c_gold_label=None,
           merge_index=None):
     """Normalise annotation (and optional gold) frames, then build Stan data.

     Args:
         annodf: Frame of annotations holding uid, item and label columns.
         golddf: Optional frame of gold labels; when ``None`` no gold
             dictionary is built.
         c_anno_uid, c_anno_item, c_anno_label: Column names in ``annodf``
             to use in place of ``self.uid_colname`` /
             ``self.item_colname`` / ``self.label_colname``.
         c_gold_item, c_gold_label: Same, for ``golddf``.
         merge_index: Optional name of an ``annodf`` column to carry over
             into ``self.annodf``; also stored as
             ``self.merge_index_colname``.

     Sets ``self.annodf``, ``self.uiddict``, ``self.itemdict`` and, when gold
     data is given, ``self.golddict`` (item -> label, ``None`` labels
     removed). Finishes by calling ``self.produce_stan_data()``.
     """
     uid_col = self.uid_colname
     item_col = self.item_colname
     label_col = self.label_colname

     # Caller-supplied column name -> canonical column name.
     colrename = {}
     for given, canonical in (
             (c_anno_uid, uid_col),
             (c_anno_item, item_col),
             (c_anno_label, label_col),
             (c_gold_item, item_col),
             (c_gold_label, label_col),
     ):
         if given is not None:
             colrename[given] = canonical

     selected = annodf[[
         c_anno_uid or uid_col,
         c_anno_item or item_col,
         c_anno_label or label_col,
     ]]
     self.annodf = selected.rename(columns=colrename)[[
         uid_col, item_col, label_col
     ]]
     if merge_index is not None:
         self.merge_index_colname = merge_index
         self.annodf[merge_index] = annodf[merge_index]
     # Rows with any missing value are discarded before categorical encoding.
     self.annodf = self.annodf.dropna().copy()
     self.uiddict = utils.make_categorical(self.annodf, uid_col)
     self.itemdict = utils.make_categorical(self.annodf, item_col)

     if golddf is not None:
         gold = golddf[[c_gold_item or item_col, c_gold_label or label_col]]
         gold = gold.rename(columns=colrename)[[item_col, label_col]]
         gold = utils.translate_categorical(gold, item_col, self.itemdict)
         item_to_label = gold.set_index(item_col).to_dict()[label_col]
         self.golddict = {
             item: label
             for item, label in item_to_label.items() if label is not None
         }

     self.produce_stan_data()
예제 #5
0
    def __init__(self,
                 rawdata_dir='data/coco/person_keypoints_train2017.json',
                 max_items=500,
                 minlabelsperitem=4):
        """Load keypoint annotations from a JSON file, grouped by image.

        Args:
            rawdata_dir: Path of the JSON file; it must provide the
                ``"categories"`` and ``"annotations"`` lists.
            max_items: Maximum number of images to process.
            minlabelsperitem: Images with fewer annotations than this are
                ignored.

        Sets ``self.category_id_skeletons`` (category id -> skeleton array
        shifted by -1), ``self.df`` (one row per image with the list of its
        keypoint arrays in ``gold``) and ``self.itemdict``.
        """
        with open(rawdata_dir) as f:
            dataset = json.load(f)
        # Skeleton indices are shifted down by one (presumably 1-based to
        # 0-based - TODO confirm against the data format).
        self.category_id_skeletons = {
            c["id"]: np.array(c["skeleton"]) - 1
            for c in dataset["categories"]
        }

        # Group the annotations by the image they belong to.
        img_label = {}
        for dataset_annotation in dataset["annotations"]:
            img_label.setdefault(dataset_annotation["image_id"],
                                 []).append(dataset_annotation)
        img_label_minlen = {
            k: v
            for k, v in img_label.items() if len(v) >= minlabelsperitem
        }

        item = []
        annotation = []
        category = []
        for n_done, dataset_annotations in enumerate(
                img_label_minlen.values()):
            # Stop once exactly max_items images are processed; the previous
            # "count > max_items" check let one extra image through.
            if n_done >= max_items:
                break
            for dataset_annotation in dataset_annotations:
                # Keypoints are stored flat; view them as rows of 3 values.
                kp = np.reshape(dataset_annotation["keypoints"], (-1, 3))
                # Keep rows whose third value exceeds -90, then drop that
                # third column.
                kp = kp[kp[:, 2] > -90][:, :2]
                if len(kp) == 0:
                    continue
                item.append(dataset_annotation["image_id"])
                annotation.append(kp)
                category.append(dataset_annotation["category_id"])
        kp_df = pd.DataFrame({
            "item": item,
            "gold": annotation,
            "category": category
        })
        self.df = kp_df.groupby("item")["gold"].apply(list).reset_index()
        self.itemdict = utils.make_categorical(self.df, "item")
 def __init__(self, rawdata_dir, max_items=10000):
     """Load bounding boxes and convert them to corner coordinates.

     Args:
         rawdata_dir: Data source handed to ``pd.read_csv``. The file is
             space-separated, has no header row, and is read with the
             column names ``img``, ``x``, ``y``, ``w`` and ``h``.
         max_items: Only the first ``max_items`` rows are kept.

     Sets ``self.df`` (with an added ``goldcoords`` column holding
     ``[x, y, x + w, y + h]``), ``self.img_lookup`` and ``self.gold``
     (``goldcoords`` indexed by ``img``).
     """
     # ``error_bad_lines`` was removed from read_csv in pandas 2.0;
     # ``on_bad_lines="skip"`` (pandas >= 1.3) is the documented successor.
     raw = pd.read_csv(rawdata_dir,
                       on_bad_lines="skip",
                       header=None,
                       sep=" ",
                       names=["img", "x", "y", "w", "h"])
     # Copy the slice so the new column is written to an independent frame
     # rather than to a view of the parsed data.
     self.df = raw[:max_items].copy()
     # Built with a plain comprehension instead of a row-wise apply, so the
     # result is always one list per row, including for an empty frame.
     self.df["goldcoords"] = [
         [x, y, x + w, y + h]
         for x, y, w, h in zip(self.df["x"], self.df["y"], self.df["w"],
                               self.df["h"])
     ]
     self.img_lookup = utils.make_categorical(self.df, "img")
     self.gold = self.df.set_index("img")["goldcoords"]