def prepare_distinct(path, out, nlp):
    """Write named-entity overlap features for every question pair in a CSV.

    Each input row's ``question1`` and ``question2`` values are lower-cased,
    passed through ``remove_punctuation`` and then through ``nlp``. The text
    of every item in the resulting ``.ents`` is collected per question and
    four features are written to ``out``:

    * ``ent_max``     - larger of the two entity counts
    * ``ent_min``     - smaller of the two entity counts
    * ``ent_diff``    - ``ent_max - ent_min``
    * ``ent_jaccard`` - ``get_jaccard`` of the two entity-text lists, or
      ``-1`` when that call raises

    Args:
        path: input CSV path; needs ``question1`` and ``question2`` columns.
        out: output CSV path; it is overwritten.
        nlp: callable taking a unicode string and returning an object with
            an ``ents`` iterable whose items expose ``.text`` (presumably a
            spaCy pipeline -- confirm against the caller).
    """
    print(path)
    header = ['ent_max', 'ent_min', 'ent_diff', 'ent_jaccard']
    start = datetime.now()
    # Both files are managed by the ``with`` so the input handle is closed
    # too, even if a row fails part-way through.
    with open(out, 'w') as outfile, open(path) as infile:
        outfile.write(','.join(header) + '\n')
        for c, row in enumerate(DictReader(infile, delimiter=',')):
            if c % 100000 == 0:
                print('finished %s' % c)

            q1 = unicode(remove_punctuation(str(row['question1']).lower()))
            q2 = unicode(remove_punctuation(str(row['question2']).lower()))

            q1_ent = [ent.text for ent in nlp(q1).ents]
            q2_ent = [ent.text for ent in nlp(q2).ents]

            q1_len = len(q1_ent)
            q2_len = len(q2_ent)
            ent_max = max(q1_len, q2_len)
            ent_min = min(q1_len, q2_len)
            ent_diff = ent_max - ent_min

            # Best effort: keep the -1 sentinel when the similarity cannot
            # be computed, but let KeyboardInterrupt/SystemExit propagate.
            try:
                ent_jaccard = get_jaccard(q1_ent, q2_ent)
            except Exception:
                ent_jaccard = -1

            features = (ent_max, ent_min, ent_diff, ent_jaccard)
            outfile.write('%s,%s,%s,%s\n' % features)

    end = datetime.now()
    print('times: %s' % (end - start))
def _neighbour_entropy(neighbours, df_dict, n_qids):
    """Return the sum of ``-p * log(p)`` over ``neighbours``.

    ``p`` is ``df_dict.get(q, 1) / n_qids`` for each neighbour ``q``; a
    neighbour missing from ``df_dict`` counts as 1. ``n_qids`` must be a
    float so the division is not truncated.
    """
    total = 0.0
    for q in neighbours:
        p = df_dict.get(q, 1) / n_qids
        total += -p * log(p)
    return total


def prepare_hash_df(path, out, neighbour_dict, df_dict):
    """Write neighbour-set entropy and overlap features for each pair.

    For every row of the CSV at ``path`` the ``question1_hash`` and
    ``question2_hash`` values are looked up in ``neighbour_dict`` (missing
    keys give an empty list). The following columns are written to ``out``:

    * ``max_entropy`` / ``min_entropy`` - larger / smaller of the two
      per-question sums of ``-p * log(p)``
    * ``jaccard`` - ``get_jaccard`` of the two neighbour lists
    * ``intersection`` - number of distinct neighbours shared by both
    * ``intersection_entropy`` - the same entropy sum over shared neighbours

    Args:
        path: input CSV path with ``question1_hash`` / ``question2_hash``.
        out: output CSV path; it is overwritten.
        neighbour_dict: maps a question hash (as ``str``) to an iterable of
            neighbours; its size is the denominator of ``p``.
        df_dict: maps a neighbour to a count; defaults to 1 when absent.
    """
    n_qids = float(len(neighbour_dict))
    print(path)
    start = datetime.now()
    # The input file is opened in the ``with`` so that it is closed as well.
    with open(out, 'w') as outfile, open(path) as infile:
        outfile.write(
            'max_entropy,min_entropy,jaccard,intersection,intersection_entropy\n'
        )
        for c, row in enumerate(DictReader(infile, delimiter=',')):
            if c % 100000 == 0:
                print('finished %s' % c)

            q1_df = neighbour_dict.get(str(row['question1_hash']), [])
            q2_df = neighbour_dict.get(str(row['question2_hash']), [])

            HA = _neighbour_entropy(q1_df, df_dict, n_qids)
            HB = _neighbour_entropy(q2_df, df_dict, n_qids)
            qmax = max(HA, HB)
            qmin = min(HA, HB)

            intersection = set(q1_df).intersection(set(q2_df))
            H_intersection = _neighbour_entropy(intersection, df_dict, n_qids)
            jaccard = get_jaccard(q1_df, q2_df)

            outfile.write(
                '%s,%s,%s,%s,%s\n' %
                (qmax, qmin, jaccard, len(intersection), H_intersection))

    end = datetime.now()
    print('times: %s' % (end - start))
def prepare_ngram_interaction(path, out, ngram='unigram'):
    """Compute n-gram interaction features for sentence pairs with pandas.

    Reads the CSV at ``path``, splits the ``sen1_<ngram>`` and
    ``sen2_<ngram>`` columns on whitespace and writes fourteen features per
    row to ``out`` (no index column). Every output column name carries the
    ``_<ngram>`` suffix.

    NOTE(review): this file defines another ``prepare_ngram_interaction``
    further down; as written, that later definition rebinds the name, so
    this version is only reachable if the later one is removed or renamed.

    Args:
        path: input CSV path.
        out: output CSV path; it is overwritten.
        ngram: suffix selecting the input columns and naming the outputs.
    """
    feature_names = [
        'jaccard', 'dice',
        'count_s1_in_s2', 'ratio_s1_in_s2',
        'count_of_sen1', 'count_of_sen2',
        'count_of_unique_sen1', 'count_of_unique_sen2',
        'ratio_of_unique_sen1', 'ratio_of_unique_sen2',
        'count_of_digit_sen1', 'count_of_digit_sen2',
        'ratio_of_digit_sen1', 'ratio_of_digit_sen2',
    ]
    columns = [name + '_' + ngram for name in feature_names]

    data_input = pd.read_csv(path)

    # Accumulate plain lists and build the frame once: enlarging a DataFrame
    # with ``.loc[index] = ...`` inside the loop re-allocates on every row.
    rows = []
    for _, row in data_input.iterrows():
        s1_ngram = str(row['sen1_%s' % ngram]).split()
        s2_ngram = str(row['sen2_%s' % ngram]).split()
        rows.append([
            get_jaccard(s1_ngram, s2_ngram),
            get_dice(s1_ngram, s2_ngram),
            get_count_s1_in_s2(s1_ngram, s2_ngram),
            get_ratio_s1_in_s2(s1_ngram, s2_ngram),
            get_count_of_sen(s1_ngram),
            get_count_of_sen(s2_ngram),
            get_count_of_unique_sen(s1_ngram),
            get_count_of_unique_sen(s2_ngram),
            get_ratio_of_unique_sen(s1_ngram),
            get_ratio_of_unique_sen(s2_ngram),
            get_count_of_digit(s1_ngram),
            get_count_of_digit(s2_ngram),
            get_ratio_of_digit(s1_ngram),
            get_ratio_of_digit(s2_ngram),
        ])

    data_ouput = DataFrame(rows, columns=columns)
    data_ouput.to_csv(out, index=False)
def generate_ngram_inter(path, out):
    """Write token-overlap and per-sentence statistics for sentence pairs.

    For every row of the CSV at ``path`` the ``sen1`` and ``sen2`` values
    are split on whitespace. The output row holds, in order:

    * ``jaccard``, ``dice``, ``count_q1_in_q2``, ``ratio_q1_in_q2``
    * each of five per-sentence measures for ``sen1`` then ``sen2``
      (token count, unique count, unique ratio, digit count, digit ratio)
    * the ``min`` then ``max`` of each of those five measures

    Args:
        path: input CSV path; needs ``sen1`` and ``sen2`` columns.
        out: output CSV path; it is overwritten.
    """
    print('generate basic features,data path is', path)

    # (column stem, helper) in output order; the stem is expanded to
    # ``<stem>1``/``<stem>2`` and later to ``<stem>_min``/``<stem>_max``.
    measures = [
        ('count_of_sen', get_count_of_sen),
        ('count_of_unique_sen', get_count_of_unique_sen),
        ('ratio_of_unique_sen', get_ratio_of_unique_sen),
        ('count_of_digit_sen', get_count_of_digit),
        ('ratio_of_digit_sen', get_ratio_of_digit),
    ]
    columns = ['jaccard', 'dice', 'count_q1_in_q2', 'ratio_q1_in_q2']
    for stem, _ in measures:
        columns.extend([stem + '1', stem + '2'])
    for stem, _ in measures:
        columns.extend([stem + '_min', stem + '_max'])
    # Header and row format come from the same list so they cannot drift.
    row_format = ','.join(['%s'] * len(columns)) + '\n'

    start = datetime.now()
    # The input file is opened in the ``with`` so that it is closed as well.
    with open(out, 'w') as outfile, open(path) as infile:
        outfile.write(','.join(columns) + '\n')
        for row in DictReader(infile, delimiter=','):
            sen1 = str(row['sen1']).split()
            sen2 = str(row['sen2']).split()

            features = [
                get_jaccard(sen1, sen2),
                get_dice(sen1, sen2),
                get_count_q1_in_q2(sen1, sen2),
                get_ratio_q1_in_q2(sen1, sen2),
            ]
            per_sentence = []
            extremes = []
            for _, measure in measures:
                value1 = measure(sen1)
                value2 = measure(sen2)
                per_sentence.extend([value1, value2])
                extremes.extend([min(value1, value2), max(value1, value2)])
            features.extend(per_sentence)
            features.extend(extremes)

            outfile.write(row_format % tuple(features))

    end = datetime.now()
    print('times:', end - start)
def prepare_ngram_interaction(path, out, ngram='unigram'):
    """Write symmetric n-gram interaction features for question pairs.

    For every row of the CSV at ``path`` the ``question1_<ngram>`` and
    ``question2_<ngram>`` values are split on whitespace. Each output row
    holds ``jaccard``, ``dice``, ``count_q1_in_q2``, ``ratio_q1_in_q2`` and
    then the ``min`` and ``max`` across the two questions of five measures
    (token count, unique count, unique ratio, digit count, digit ratio).

    The header names the min/max columns as ``*_min`` / ``*_max``. It used
    to label them ``*_question1`` / ``*_question2`` although the min/max
    values were what was written; the data rows themselves are unchanged.

    Args:
        path: input CSV path.
        out: output CSV path; it is overwritten.
        ngram: suffix selecting the ``question1_*`` / ``question2_*`` columns.
    """
    print(path)

    # (column stem, helper) in output order.
    measures = [
        ('count_of_question', get_count_of_question),
        ('count_of_unique_question', get_count_of_unique_question),
        ('ratio_of_unique_question', get_ratio_of_unique_question),
        ('count_of_digit_question', get_count_of_digit),
        ('ratio_of_digit_question', get_ratio_of_digit),
    ]
    columns = ['jaccard', 'dice', 'count_q1_in_q2', 'ratio_q1_in_q2']
    for stem, _ in measures:
        columns.extend([stem + '_min', stem + '_max'])
    # Header and row format come from the same list so they cannot drift.
    row_format = ','.join(['%s'] * len(columns)) + '\n'

    start = datetime.now()
    # The input file is opened in the ``with`` so that it is closed as well.
    with open(out, 'w') as outfile, open(path) as infile:
        outfile.write(','.join(columns) + '\n')
        for c, row in enumerate(DictReader(infile, delimiter=',')):
            if c % 100000 == 0:
                print('finished %s' % c)

            q1_ngram = str(row['question1_%s' % ngram]).split()
            q2_ngram = str(row['question2_%s' % ngram]).split()

            features = [
                get_jaccard(q1_ngram, q2_ngram),
                get_dice(q1_ngram, q2_ngram),
                get_count_q1_in_q2(q1_ngram, q2_ngram),
                get_ratio_q1_in_q2(q1_ngram, q2_ngram),
            ]
            for _, measure in measures:
                value1 = measure(q1_ngram)
                value2 = measure(q2_ngram)
                features.append(min(value1, value2))
                features.append(max(value1, value2))

            outfile.write(row_format % tuple(features))

    end = datetime.now()
    print('times: %s' % (end - start))