import itertools
import json

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import filters

# ProgressBar, loadGroundTruth, and groundtruth_filename are assumed to be
# provided elsewhere in this package.


def predict(caseversions, model):
    caseversions_sorted_by_id = sorted(caseversions['objects'],
                                       key=lambda x: x['id'])
    caseversion_texts = [json.dumps(cv) for cv in caseversions_sorted_by_id]

    # Cosine similarity between every pair of JSON-encoded case versions,
    # computed as the TF-IDF matrix times its transpose.
    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(caseversion_texts)
    pairwise_similarity = tfidf * tfidf.T

    features = []
    case_ids = []
    n = len(caseversion_texts)
    p = ProgressBar(n * (n - 1) // 2)  # number of unordered pairs

    counter = 0
    for i, j in itertools.combinations(range(n), 2):
        try:
            p.update(counter)
            counter += 1
            case_ids.append({
                'lhs_id': caseversions_sorted_by_id[i]['id'],
                'rhs_id': caseversions_sorted_by_id[j]['id'],
            })
            diff = filters.calcDiff(
                json.dumps(caseversions_sorted_by_id[i]),
                json.dumps(caseversions_sorted_by_id[j]),
            )
            features.append({
                "similarity": pairwise_similarity[i, j],
                "isonoff": filters.isOnOffPairs(diff),
                "isdiffmodule": filters.isDifferentModule(diff),
            })
        except KeyboardInterrupt:
            # Allow partial runs: truncate so ids and features stay aligned.
            old_len = min(len(case_ids), len(features))
            case_ids = case_ids[:old_len]
            features = features[:old_len]
            break

    vec = DictVectorizer()
    vectorized_features = vec.fit_transform(features)
    p.done()
    return {'ids': case_ids, 'predictions': model.predict(vectorized_features)}
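# Illustrative helper, not part of the original module: given the dict
# returned by predict(), pull out just the pairs the model flagged as
# duplicates. The label encoding (1 == duplicate) is an assumption; this
# file never defines the model's classes.
def duplicate_pairs(prediction_result):
    return [pair
            for pair, label in zip(prediction_result['ids'],
                                   prediction_result['predictions'])
            if label == 1]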
def prepare_training_data(caseversions):
    caseversions_sorted_by_id = sorted(caseversions['objects'],
                                       key=lambda x: x['id'])
    idx_from_caseversion_id = {str(d['id']): i
                               for i, d in enumerate(caseversions_sorted_by_id)}
    # TODO: can we reduce the number of cases here?
    # TODO: find the intersection between the groundtruth and the caseversions
    caseversion_texts = [json.dumps(cv) for cv in caseversions_sorted_by_id]

    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(caseversion_texts)
    pairwise_similarity = tfidf * tfidf.T

    groundtruth = loadGroundTruth(groundtruth_filename)

    features = []
    targets = []
    p = ProgressBar(len(groundtruth['ids']))

    # groundtruth['targets'] is assumed to be parallel to groundtruth['ids'].
    # Pairs referencing case versions outside this set are skipped, and their
    # targets are dropped with them so features and targets stay aligned.
    counter = 0
    for pair, target in zip(groundtruth['ids'], groundtruth['targets']):
        try:
            r = idx_from_caseversion_id[pair['lhs_id']]
            c = idx_from_caseversion_id[pair['rhs_id']]
        except KeyError:
            continue
        diff = filters.calcDiff(caseversion_texts[r], caseversion_texts[c])
        features.append({
            "similarity": pairwise_similarity[r, c],
            "isonoff": filters.isOnOffPairs(diff),
            "isdiffmodule": filters.isDifferentModule(diff),
        })
        targets.append(target)
        p.update(counter)
        counter += 1

    vec = DictVectorizer()
    vectorized_features = vec.fit_transform(features)
    p.done()
    return (vectorized_features, targets)
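# Minimal end-to-end sketch under stated assumptions: 'caseversions.json' is
# a hypothetical dump containing the 'objects' list these functions expect,
# and LogisticRegression is a stand-in classifier; the original code never
# says which model it trains.
if __name__ == '__main__':
    from sklearn.linear_model import LogisticRegression

    with open('caseversions.json') as f:
        caseversions = json.load(f)

    X, y = prepare_training_data(caseversions)
    model = LogisticRegression().fit(X, y)

    result = predict(caseversions, model)
    print(duplicate_pairs(result)[:10])  # first few predicted duplicate pairs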