def retrieveCatLabels(Y_out, training_data_file_name):
    training_data_file_name_prefix=training_data_file_name.split("-")[0]
    unrolled_data_file_name=training_data_file_name_prefix+explore.unrolled_file_name_suffix
    unrolled_data=explore.read_unrolled_data(unrolled_data_filename=unrolled_data_file_name)
    postings_sorted_by_freq_file=open(postings_sorted_by_freq_file_name)
    query_data=getQueryData(unrolled_data=unrolled_data, Y_out=Y_out, use_stemming=True)
    for index in range(len(query_data)):
        query_terms=query_data[index].query_terms
        postings_set=Set([])
        #rewind the postings file for each query; without this, readlines()
        #exhausts the file on the first query and every later query sees no data
        postings_sorted_by_freq_file.seek(0)
        print "processing term ", index
        iteration=0
        while 1:
            print "iteration : ", iteration
            iteration+=1
            #read the postings file in ~100MB chunks so it never has to fit in memory at once
            lines=postings_sorted_by_freq_file.readlines(100000000)
            if not lines:
                break
            print "initiating temp_dict"
            temp_postings_dict={}
            for line in lines:
                fields=line.split(":")
                token=ps.stem(fields[1].lower().strip())
                postings=fields[2].lower().strip().split()
                temp_postings_dict[token]=postings
            #at this point, our temp dictionary is built
            temp_postings_dict_keys=Set(temp_postings_dict.keys())
            print "completed building temp_dict. len(dict) = ", len(temp_postings_dict)
            #intersect the postings lists of every query term found in this chunk
            for term in query_terms:
                if term in temp_postings_dict_keys:
                    if len(postings_set)==0:
                        postings_set=Set(temp_postings_dict[term])
                    else:
                        postings_set=postings_set & Set(temp_postings_dict[term])
        query_data[index].postings=list(postings_set)
        print "size of postings = ", len(query_data[index].postings), " postings : ", query_data[index].postings[0:10]
        raw_input("continue : ")
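#A minimal, self-contained sketch of the intersection step in retrieveCatLabels(),
#assuming postings lines follow a "<freq>:<token>:<docID docID ...>" layout (implied
#by the split(":") indexing above, but not confirmed by this file). The toy data and
#the _intersect_postings_sketch helper are illustrative only, not pipeline code.
def _intersect_postings_sketch():
    toy_lines=[
        "42:camera:doc1 doc3 doc7",
        "17:canon:doc3 doc7 doc9",
    ]
    postings_dict={}
    for line in toy_lines:
        fields=line.split(":")
        postings_dict[fields[1].lower().strip()]=fields[2].lower().strip().split()
    #intersect postings across all query terms, seeding from the first term that hits
    result=Set([])
    for term in ["camera", "canon"]:
        if term in postings_dict:
            if len(result)==0:
                result=Set(postings_dict[term])
            else:
                result=result & Set(postings_dict[term])
    print "intersection : ", sorted(result)  # -> ['doc3', 'doc7']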
def gen_stanford_ner_training_data():
    '''
    use a BILOU annotation scheme: B=beginning, I=inside, L=last, O=outside, U=unit-length
    turns out, the BILOU scheme gets ~25% F1 score on the stanford system
    '''
    train_test_split=0.25
    Y_ref=pickle.load(open('Y_ref.pkl','r'))
    stanford_ner_training_data_file=open('stanford_ner_training_data_25_split.txt','w')
    stanford_ner_testing_data_file=open('stanford_ner_testing_data.txt','w')
    unrolled_data=explore.read_unrolled_data(unrolled_data_filename='training_unrolled_data.txt')
    for index in range(len(Y_ref)):
        #guard the sequence boundaries explicitly; indexing Y_ref[index-1] at index 0
        #would silently wrap around to the last element
        prev=Y_ref[index-1] if index>0 else 0
        nxt=Y_ref[index+1] if index+1<len(Y_ref) else 0
        if Y_ref[index]==0:
            label='O'
        elif prev==0 and nxt==1:
            label='B'
        elif prev==1 and nxt==1:
            label='I'
        elif prev==1 and nxt==0:
            label='L'
        else: #prev==0 and nxt==0: a single-token mention
            label='U'
        record=unrolled_data[index][3]+"\t"+label+"\n"
        #write each record to exactly one file so the train and test sets stay disjoint
        if random.random()<train_test_split:
            stanford_ner_training_data_file.write(record)
        else:
            stanford_ner_testing_data_file.write(record)
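#Hedged illustration of the BILOU rules encoded in the elif-chain above: bilou_tags()
#is a hypothetical pure helper, not part of the training pipeline. It pads the binary
#sequence with 0s at both ends so the boundary cases match the guarded lookups in
#gen_stanford_ner_training_data().
def bilou_tags(y):
    padded=[0]+list(y)+[0]
    tags=[]
    for i in range(1, len(padded)-1):
        (prev, cur, nxt)=(padded[i-1], padded[i], padded[i+1])
        if cur==0:
            tags.append('O')
        elif prev==0 and nxt==1:
            tags.append('B')
        elif prev==1 and nxt==1:
            tags.append('I')
        elif prev==1 and nxt==0:
            tags.append('L')
        else: #prev==0 and nxt==0
            tags.append('U')
    return tags
#e.g. bilou_tags([0, 1, 1, 1, 0, 1, 0]) -> ['O', 'B', 'I', 'L', 'O', 'U', 'O']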
def get_cprod_baseline_dict_mapping(input_file_name):
    '''
    whether a given token is a brandname, a common english word, or not found
    in the dictionary supplied by the competition baseline2.
    '''
    cprod_baseline_dict_mapping_prods={'brandname':1.0, 'merchant':2.0} #if not found, mark it as -1.0
    cprod_baseline_dict_mapping_lang={'encommonword':1.0, 'grammaticalword':2.0}
    input_file_name_prefix=input_file_name.split("-")[0]
    unrolled_file_name_suffix='_unrolled_data.txt'
    unrolled_file_name=input_file_name_prefix+unrolled_file_name_suffix
    unrolled_data=explore.read_unrolled_data(unrolled_data_filename=unrolled_file_name)
    cprod_data=read_n_filter_cprod_baseline_dict()
    cprod_keys=Set(cprod_data.keys())
    #the default assumption is that the avg token is not a brandname or merchant name
    X_prods=np.ones(len(unrolled_data))*-1.0
    #if a token is not found in the dictionary, it is likely close to a common english word
    X_lang=np.zeros(len(unrolled_data))
    for index, item in enumerate(unrolled_data):
        (textID, offset, lineNo, token)=item
        token=token.lower().strip()
        if token in cprod_keys:
            label=cprod_data[token]
            if label in cprod_baseline_dict_mapping_prods:
                X_prods[index]=cprod_baseline_dict_mapping_prods[label]
            if label in cprod_baseline_dict_mapping_lang:
                X_lang[index]=cprod_baseline_dict_mapping_lang[label]
    return (X_prods, X_lang)
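#Hypothetical usage sketch: the two feature columns returned above can be stacked
#into a per-token feature matrix before training. The input file name and the
#np.column_stack call are illustrative assumptions, not pipeline code.
def _cprod_features_usage_sketch():
    (X_prods, X_lang)=get_cprod_baseline_dict_mapping('training-data.json')
    X_dict_features=np.column_stack((X_prods, X_lang)) #shape: (n_tokens, 2)
    print "dict feature matrix shape : ", X_dict_features.shape
    print "tokens flagged as brandname : ", np.sum(X_prods==1.0)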