Example #1
import itertools
import json

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Assumed project-local helpers (import paths are hypothetical): `filters`
# provides the diff heuristics used below, and `ProgressBar` is a simple
# console progress indicator with update()/done().
import filters
from progress_bar import ProgressBar


def predict(caseversions, model):
    """Featurize every pair of case versions and return the pair ids
    together with the model's duplicate predictions."""
    caseversions_sorted_by_id = sorted(caseversions['objects'], key=lambda x: x['id'])
    # A list (not a map object) is needed: the texts are measured and
    # indexed by position below.
    caseversion_texts = [json.dumps(x) for x in caseversions_sorted_by_id]

    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(caseversion_texts)
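    # TfidfVectorizer L2-normalises each row by default, so this sparse
    # dot product yields the cosine similarity for every document pair.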
    pairwise_similarity = tfidf * tfidf.T

    features = []
    case_ids = []
    n = len(caseversion_texts)
    p = ProgressBar(n * (n - 1) // 2)  # number of unordered pairs, without materialising them


    counter = 0
    for i, j in itertools.combinations(range(n), 2):
        try:
            p.update(counter)
            counter += 1
            case_ids.append({
                'lhs_id': caseversions_sorted_by_id[i]['id'],
                'rhs_id': caseversions_sorted_by_id[j]['id']
            })
            diff = filters.calcDiff(
                json.dumps(caseversions_sorted_by_id[i]),
                json.dumps(caseversions_sorted_by_id[j])
            )
            features.append({
                "similarity": pairwise_similarity[i, j],
                "isonoff": filters.isOnOffPairs(diff),
                "isdiffmodule": filters.isDifferentModule(diff)
                #"tfidf_diff": tfidf[i] - tfidf[j]
            })
        except KeyboardInterrupt:
            # Allow an early Ctrl-C: trim whichever list ran ahead so ids
            # and features stay aligned before vectorizing.
            keep = min(len(case_ids), len(features))
            case_ids = case_ids[:keep]
            features = features[:keep]
            break


    vec = DictVectorizer()
    vectorized_features = vec.fit_transform(features)
    p.done()

    return {'ids': case_ids, 'predictions': model.predict(vectorized_features)}
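
A minimal usage sketch for Example #1. The file names and the pickled, pre-trained scikit-learn classifier are illustrative assumptions; only the caseversion JSON layout ({'objects': [{'id': ...}, ...]}) is taken from the code above.

import json
import pickle

with open('caseversions.json') as f:    # hypothetical dump of case versions
    caseversions = json.load(f)
with open('dup_model.pkl', 'rb') as f:  # hypothetical pre-trained classifier
    model = pickle.load(f)

result = predict(caseversions, model)
for pair, label in zip(result['ids'], result['predictions']):
    if label:
        print(pair['lhs_id'], pair['rhs_id'], 'predicted duplicate')

Note that predict() fits a fresh DictVectorizer on its own features; this only lines up with training because both sides use the same three feature keys, which DictVectorizer sorts alphabetically.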
Example #2
def prepare_training_data(caseversions):
    """Build the (features, targets) training set for the duplicate
    classifier from the pairs labelled in the ground-truth file.

    Uses the same imports as Example #1 (json, itertools, the
    scikit-learn vectorizers, and the project-local helpers).
    """
    caseversions_sorted_by_id = sorted(caseversions['objects'], key=lambda x: x['id'])
    idx_from_caseversion_id = {str(d['id']): i
                               for i, d in enumerate(caseversions_sorted_by_id)}
    #TODO: can we reduce the number of cases here?
    #TODO: find the intersection between the groundtruth and the caseversions
    caseversion_texts = [json.dumps(x) for x in caseversions_sorted_by_id]

    vect = TfidfVectorizer(min_df=1)
    tfidf = vect.fit_transform(caseversion_texts)
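    # As in Example #1: rows are L2-normalised, so tfidf * tfidf.T is the
    # pairwise cosine-similarity matrix.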
    pairwise_similarity = tfidf * tfidf.T

    # `loadGroundTruth` and `groundtruth_filename` are assumed to be
    # defined at module level elsewhere in this project.
    groundtruth = loadGroundTruth(groundtruth_filename)
    features = []
    targets = []  # collected alongside features so the lengths always match
    p = ProgressBar(len(groundtruth['ids']))  # one step per labelled pair

    for idx, pair in enumerate(groundtruth['ids']):
        # Map the labelled pair onto row/column indices in the
        # similarity matrix.
        try:
            r = idx_from_caseversion_id[pair['lhs_id']]
            c = idx_from_caseversion_id[pair['rhs_id']]
        except KeyError:
            # The pair references a caseversion that is not in this
            # (possibly reduced) set; skip it entirely so that features
            # and targets stay the same length.
            continue

        diff = filters.calcDiff(caseversion_texts[r], caseversion_texts[c])
        features.append({
            "similarity": pairwise_similarity[r, c],
            "isonoff": filters.isOnOffPairs(diff),
            "isdiffmodule": filters.isDifferentModule(diff)
        })
        targets.append(groundtruth['targets'][idx])
        p.update(idx)


    vec = DictVectorizer()
    vectorized_features = vec.fit_transform(features)

    p.done()
    # `targets` was filtered in step with `features`, so the two line up
    # even when some ground-truth pairs were skipped above.
    return (vectorized_features, targets)
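
A hedged end-to-end sketch tying the two examples together; the classifier choice (LogisticRegression) and the file name are illustrative assumptions, not from the source.

import json

from sklearn.linear_model import LogisticRegression

with open('caseversions.json') as f:  # hypothetical dump location
    caseversions = json.load(f)

# Train on the labelled pairs, then score every pair in the set.
X, y = prepare_training_data(caseversions)
model = LogisticRegression()
model.fit(X, y)

result = predict(caseversions, model)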