def prepare_a_tf(corpus_root, corpus, year, mallet_act_results):
    tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", "", "")
    # subset and cat_type are fixed as "a" and "pn"
    a_tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", "a", "pn")
    res_lines = open(mallet_act_results, 'r').readlines()
    tf_lines = open(tf_file, 'r').readlines()
    s_a_tf = open(a_tf_file, 'w')

    attDict = {}
    print "building attributes dict..."
    for i in range(0, len(res_lines)):
        fields = res_lines[i].split()
        tuples = [('c', float(fields[2])), ('t', float(fields[4])), ('a', float(fields[6]))]
        num_tuples = [e[1] for e in tuples]
        max_list = [e[0] for e in tuples if e[1] == max(num_tuples)]
        if 'a' in max_list:
            attDict[fields[0]] = 1

    count = 0
    #sortedKeys = sorted(attDict.keys())
    for line in tf_lines:
        count += 1
        if count % 100000 == 0:
            print count
        term = '_'.join(line.split('\t')[0].split())
        if term in attDict:
            s_a_tf.write(line)
    s_a_tf.close()
def run_classify(corpus, year, cat_type, subset=""):
    # corpus_root = "/home/j/anick/patent-classifier/ontology/creation/data/patents/"
    #tv_loc = "/data/tv/"
    outfile_qualifier = "cat"
    priors_qualifier = "cat_prob"
    terms_qualifier = "tf"
    term2freq_qualifier = "terms"
    lfgc_qualifier = "fc_kl"

    ################ variable parts of path
    outfile_year = str(year)
    year_cat_name = outfile_year + "." + cat_type
    #corpus = "ln-us-cs-500k"
    #corpus = "ln-us-12-chemical"
    ################

    #print "[run_classify]Output dir: %s" % tv_loc
    #path_to_terms_file = outroot + corpus + tv_loc + outfile_year + "."
    #path_to_terms_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, "")
    #path_to_file = outroot + corpus + tv_loc + year_cat_name + "."
    #priors_file = path_to_file + priors_qualifier
    priors_file = pnames.tv_filepath(corpus_root, corpus, year, priors_qualifier, subset, cat_type)
    #terms_file = path_to_terms_file + terms_qualifier
    terms_file = pnames.tv_filepath(corpus_root, corpus, year, terms_qualifier, subset, "")
    #lfgc_file = path_to_file + lfgc_qualifier
    lfgc_file = pnames.tv_filepath(corpus_root, corpus, year, lfgc_qualifier, subset, cat_type)
    #term2freq_file = path_to_terms_file + term2freq_qualifier
    term2freq_file = pnames.tv_filepath(corpus_root, corpus, year, term2freq_qualifier, "", "")

    # compute l_cats, l_priors, d_lfgc, d_term2feats once and use them to run several thresholds
    print "[nbayes.py]priors_file: %s" % priors_file
    (l_cats, l_priors) = populate_priors(priors_file)
    print "[nbayes.py]lfgc_file: %s" % lfgc_file
    d_lfgc = populate_lfgc(lfgc_file)
    print "[nbayes.py]terms_file: %s" % terms_file
    d_term2feats = populate_terms(terms_file)
    print "[nbayes.py]term2freq_file: %s" % term2freq_file
    d_term2freq = populate_term2freq(term2freq_file)

    # min_weight = .2
    #for min_weight in [.1, .2]:
    for cutoff in [.1, .05, .0]:
        cutoff_qualifier = role.cat_cutoff_file_type(cutoff)
        #outfile = path_to_file + outfile_qualifier + ".w" + cutoff_qualifier
        outfile = pnames.tv_filepath(corpus_root, corpus, year, cutoff_qualifier, subset, cat_type)
        print "[nbayes.py]classifying into outfile: %s" % outfile
        classify(l_cats, l_priors, d_lfgc, d_term2feats, d_term2freq, cutoff, outfile)
def run_domain_score(corpus1, corpus1_size, corpus2, corpus2_size, year):
    # corpus_root = "/home/j/anick/patent-classifier/ontology/creation/data/patents/"
    #outfile_name = corpus1 + "_" + corpus2 + ".ds"
    outfile = pnames.tv_filepath(corpus_root, corpus1, year, "ds", "", "")
    f_terms1 = pnames.tv_filepath(corpus_root, corpus1, year, "terms", "", "")
    f_terms2 = pnames.tv_filepath(corpus_root, corpus2, year, "terms", "", "")
    domain_score(f_terms1, corpus1_size, f_terms2, corpus2_size, outfile)
def cat_filter(corpus_root, corpus, year, cat_type, subset, min_freq, min_domain_score, max_freq):
    cat_file_type = "cat.w0.0"
    f_cat = pnames.tv_filepath(corpus_root, corpus, year, cat_file_type, subset, cat_type)
    f_ds = pnames.tv_filepath(corpus_root, corpus, year, "ds", "", "")
    out_file_type = cat_file_type + "_r" + str(min_freq) + "-" + str(max_freq) + "_ds" + str(min_domain_score)
    f_out = pnames.tv_filepath(corpus_root, corpus, year, out_file_type, subset, cat_type)
    d_term2cat = {}
    d_term2ds = {}
    s_cat = codecs.open(f_cat, encoding='utf-8')
    s_ds = codecs.open(f_ds, encoding='utf-8')
    s_out = codecs.open(f_out, "w", encoding='utf-8')

    # store domain_scores
    for line in s_ds:
        line = line.strip()
        # e.g.: proximal zone	5	1	1.841114
        (term, freq, generic_freq, domain_score) = line.split("\t")
        d_term2ds[term] = float(domain_score)

    # categorized terms
    for line in s_cat:
        line = line.strip()
        l_fields = line.split("\t")
        term = l_fields[0]
        cat = l_fields[3]
        try:
            freq = int(l_fields[4])
        except:
            print "[cat_filter]In line: %s" % line
            print "[cat_filter]Illegal integer in field 4: [%s][%s][%s][%s][%s][%s]" % (
                l_fields[0], l_fields[1], l_fields[2], l_fields[3], l_fields[4], l_fields[5])
            quit()
        ds = d_term2ds[term]
        # filter and output
        if ds >= min_domain_score and (freq >= min_freq and freq <= max_freq):
            s_out.write("%s\t%s\t%i\t%f\n" % (term, cat, freq, ds))

    s_cat.close()
    s_ds.close()
    s_out.close()
def run_steps(corpus, year, todo_list=["nb", "ds", "cf"],
              ranges=[[10, 100000, 1.5], [2, 10, 1.5]],
              cat_type="act", subset=""):
    #parameters
    code_root = roles_config.CODE_ROOT
    # path to corpus
    # corpus_root = code_root + "data/patents/"
    corpus1_size_file = pnames.tv_filepath(corpus_root, corpus, year, "cs", "", "")
    # generic corpus for domain specificity computation
    corpus2 = "ln-us-all-600k"
    corpus2_size_file = pnames.tv_filepath(corpus_root, corpus2, year, "cs", "", "")

    # read in the corpus sizes
    with open(corpus1_size_file, 'r') as f:
        corpus1_size = int(f.readline().strip("\n"))
    with open(corpus2_size_file, 'r') as f:
        corpus2_size = int(f.readline().strip("\n"))

    if "nb" in todo_list:
        # from .fc_kl, create act.cat.w0.0
        print "[run_steps]step nb, Creating .cat.w0.0"
        run_classify(corpus, year, cat_type, subset)

    if "ds" in todo_list:
        # from the .terms files, create .ds
        print "[run_steps]step ds, Creating .ds"
        run_domain_score(corpus, corpus1_size, corpus2, corpus2_size, year)

    if "cf" in todo_list:
        # run cat_filter for each range
        for (min_freq, max_freq, min_domain_score) in ranges:
            # from .ds and act.cat.w0.0, create .cat.w0.0_gt5_ds2
            print "[run_steps]step cf, Creating .act.cat.w0.0_gt?_ds?"
            #min_freq = 5
            #min_domain_score = 2
            run_cat_filter(corpus, year, min_freq, min_domain_score, max_freq, cat_type, subset)

    print "[run_steps]Reached end of todo_list"
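# A minimal usage sketch for the pipeline driver above (not part of the original
# module). The corpus name is the one mentioned in run_classify's comments, the
# year is an illustrative assumption, and corpus_root is expected to be set at
# module level as in the other functions:
#
#   run_steps("ln-us-cs-500k", 1997, todo_list=["nb", "ds", "cf"],
#             ranges=[[10, 100000, 1.5], [2, 10, 1.5]], cat_type="act")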
def prepare_classify(corpus_root, corpus, year, cat_type, subset):
    #tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, cat_type)
    # try making cat_type empty for tf file
    tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, "")
    print "[prepare_classify]Preparing to open the .tf file: %s" % tf_file
    tf_lines = open(tf_file).readlines()
    print "[prepare_classify]Finished loading .tf file!"

    termDict = {}
    print "Creating term dict..."
    count1 = 0
    for line in tf_lines:
        if count1 % 100000 == 0:
            print count1
        count1 += 1
        fields = line.split('\t')
        # replace spaces with "_" in terms since the mallet parser uses spaces as separators
        term = '_'.join(fields[0].split())
        feature = fields[1]
        count = fields[2]
        fc = feature + ":" + count
        if term in termDict:
            termDict[term].append(fc)
        else:
            termDict[term] = [fc]

    class_input_file = pnames.tv_filepath(corpus_root, corpus, year, "unlab", subset, cat_type)
    print "class_input_file: %s" % class_input_file
    class_input = open(class_input_file, 'w')
    print "Writing into file..."
    print "Len of dict is " + str(len(termDict))
    for term in termDict:
        features = ' '.join(termDict[term])
        class_input.write(term + '\t' + features + '\n')
    class_input.close()
def run_diff_score(corpus, year1, year2):
    # corpus_root = "/home/j/anick/patent-classifier/ontology/creation/data/patents/"
    outfile_years = str(year1) + "_" + str(year2)
    outfile = pnames.tv_filepath(corpus_root, corpus, outfile_years, "diff", "", "")
    f_terms1 = pnames.tv_filepath(corpus_root, corpus, year1, "terms", "", "")
    f_terms2 = pnames.tv_filepath(corpus_root, corpus, year2, "terms", "", "")
    cat_file = pnames.tv_filepath(corpus_root, corpus, year1, "cat.w0.0", "", "act")
    f_ds1 = pnames.tv_filepath(corpus_root, corpus, year1, "ds", "", "")

    # read in the corpus sizes
    y1_size_file = pnames.tv_filepath(corpus_root, corpus, year1, "cs", "", "")
    y2_size_file = pnames.tv_filepath(corpus_root, corpus, year2, "cs", "", "")
    y1_size = 0
    y2_size = 0
    with open(y1_size_file, 'r') as f:
        y1_size = int(f.readline().strip("\n"))
    with open(y2_size_file, 'r') as f:
        y2_size = int(f.readline().strip("\n"))

    diff_score(f_terms1, y1_size, f_terms2, y2_size, f_ds1, cat_file, outfile)
def prepare_train(corpus_root, corpus, year, cat_type, subset):
    tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", subset, "")
    seed_file = pnames.tv_filepath(corpus_root, corpus, year, "tcs", subset, cat_type)
    #s_tf = open(tf_file)
    tf_lines = open(tf_file, 'r').readlines()
    print "[prepare_train]opening .tf file: %s" % tf_file
    s_tcs = open(seed_file)
    print "[prepare_train]opening .tcs (seed) file: %s" % seed_file

    termDict = {}
    for line in s_tcs:
        fields = line.split('\t')
        # replace spaces with "_" in terms since mallet parser uses spaces as separators
        term = '_'.join(fields[0].split())
        ##print term
        # A term has a label and a list of features
        termDict[term] = [fields[1], []]
    print "[prepare_train]Done creating term dictionary"

    print "[prepare_train]Building feature dictionary..."
    count1 = 0
    sortedKeys = sorted(termDict.keys())
    for line in tf_lines:
        if count1 % 100000 == 0:
            print count1
        count1 = count1 + 1
        fields = line.split('\t')
        term = '_'.join(fields[0].split())
        ##print term
        feat_val = (fields[1], fields[2])
        if termDict.has_key(term):
            termDict[term][1].append(feat_val)
    print "Finished building feature dictionary!"

    print "Writing to file..."
    #s_tf.close()
    s_tcs.close()
    mallet_in_file = pnames.tv_filepath(corpus_root, corpus, year, "train", subset, cat_type)
    s_mallet_in = open(mallet_in_file, 'w')
    for term in termDict.keys():
        s_mallet_in.write(term + '\t' + termDict[term][0] + '\t')
        if len(termDict[term][1]) == 0:
            print "[prepare_train]No features for term: %s" % term
        for f_v in termDict[term][1]:
            s_mallet_in.write(f_v[0] + ":" + f_v[1])
            s_mallet_in.write(" ")
        s_mallet_in.write("\n")
    print "Created mallet_in file in directory!"

    # create mallet vectors file from .train data
    #/home/j/corpuswork/fuse/code/patent-classifier/tools/mallet/mallet-2.0.7/bin/csv2vectors --input myInput.train --output myInput.vectors
    # create classifier from .vectors
    # /home/j/corpuswork/fuse/code/patent-classifier/tools/mallet/mallet-2.0.7/bin/vectors2classify --input myInput.vectors --training-portion 0.9 --trainer NaiveBayes --output-classifier <file>.NBclassifier > <file>.mallet_stats
    s_mallet_in.close()
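# The two mallet command lines quoted in prepare_train can also be driven from
# Python. The helper below is only a sketch, not part of the original pipeline:
# the mallet path and flags are copied from those comments, while the function
# name, its arguments, and the output file naming are hypothetical.
import subprocess

def make_mallet_classifier(train_file, vectors_file, classifier_file, stats_file,
                           mallet_bin="/home/j/corpuswork/fuse/code/patent-classifier/tools/mallet/mallet-2.0.7/bin"):
    # .train -> .vectors
    subprocess.check_call([mallet_bin + "/csv2vectors",
                           "--input", train_file,
                           "--output", vectors_file])
    # .vectors -> NaiveBayes classifier; training stats are redirected to stats_file
    with open(stats_file, 'w') as s_stats:
        subprocess.check_call([mallet_bin + "/vectors2classify",
                               "--input", vectors_file,
                               "--training-portion", "0.9",
                               "--trainer", "NaiveBayes",
                               "--output-classifier", classifier_file],
                              stdout=s_stats)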
def feat_probs(corpus, cohort_year, prob_year, feature_year):
    # create file names
    cohort_file = pnames.tv_filepath(corpus_root, corpus, cohort_year, "cohort.filt.gold", "", "")
    tf_file = pnames.tv_filepath(corpus_root, corpus, prob_year, "tf", "", "")
    feats_file = pnames.tv_filepath(corpus_root, corpus, feature_year, "feats.1000", "", "")
    year_offset = prob_year - cohort_year
    offset_probs_str = str(year_offset) + ".probs"
    fgt_file = pnames.tv_filepath(corpus_root, corpus, cohort_year, offset_probs_str, "", "")
    print "[cohort.py feat_probs]Writing to: %s" % fgt_file

    s_cohort_file = codecs.open(cohort_file, encoding='utf-8')
    s_tf_file = codecs.open(tf_file, encoding='utf-8')
    s_feats_file = codecs.open(feats_file, encoding='utf-8')
    s_fgt_file = codecs.open(fgt_file, "w", encoding='utf-8')

    # dictionaries
    # sum of probs for feature given cohort term
    d_feat2sum_prob_fgct = collections.defaultdict(int)
    # sum of probs for feature given any term
    d_feat2sum_prob_fgt = collections.defaultdict(int)
    # count of the number of terms contributing to the sum of probs, so
    # that we can divide by the count to calculate the average.
    d_feat2_count_fgct = collections.defaultdict(int)
    d_feat2_count_fgt = collections.defaultdict(int)
    # Boolean dictionaries to keep track of sets of items
    # features of interest
    d_feats = {}
    # terms in gold cohort
    d_cohort = {}
    # terms in corpus
    d_all_terms = {}

    # import features
    for line in s_feats_file:
        line = line.strip()
        l_fields = line.split("\t")
        feat = l_fields[0]
        d_feats[feat] = True

    # import gold_cohort
    for line in s_cohort_file:
        line = line.strip()
        # first line is info about the thresholds for the cohort growth
        if line[0] != "#":
            l_fields = line.split("\t")
            term = l_fields[0]
            d_cohort[term] = True

    #pdb.set_trace()
    # import f|t probs
    for line in s_tf_file:
        line = line.strip()
        l_fields = line.split("\t")
        term = l_fields[0]
        feat = l_fields[1]
        # keep track of all terms seen to count them later
        d_all_terms[term] = True
        if d_feats.has_key(feat):
            # if this is a feature we are interested in
            prob = float(l_fields[4])
            # add its prob to the total for this feature, given any term
            d_feat2sum_prob_fgt[feat] = prob + d_feat2sum_prob_fgt[feat]
            #d_feat2_count_fgt[feat] += 1
            # if the term is a cohort term, also add the prob
            # to the total for feature given cohort term
            if d_cohort.has_key(term):
                d_feat2sum_prob_fgct[feat] = prob + d_feat2sum_prob_fgct[feat]
                #d_feat2_count_fgct[feat] += 1

    #pdb.set_trace()
    # output probs
    count_all_terms = len(d_all_terms.keys())
    count_gold_terms = len(d_cohort.keys())
    print "[cohort.py] total terms in corpus: %i, in gold set: %i" % (
        count_all_terms, count_gold_terms)
    for feat in d_feats.keys():
        average_prob_fgt = float(d_feat2sum_prob_fgt[feat]) / count_all_terms
        average_prob_fgct = float(d_feat2sum_prob_fgct[feat]) / count_gold_terms
        diff = average_prob_fgct - average_prob_fgt
        ratio = average_prob_fgct / average_prob_fgt
        s_fgt_file.write("%s\t%f\t%f\t%f\t%f\n" % (
            feat, average_prob_fgct, average_prob_fgt, diff, ratio))

    s_cohort_file.close()
    s_tf_file.close()
    s_feats_file.close()
    s_fgt_file.close()
def filter_tf_file(corpus_root, corpus, year, act_file_type):
    #tf_file = tv_root + str(year) + ".tf"
    tfa_subset = "a"
    tft_subset = "t"
    tf_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", "", cat_type="")
    tfa_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", tfa_subset, cat_type="")
    tft_file = pnames.tv_filepath(corpus_root, corpus, year, "tf", tft_subset, cat_type="")
    print "[filter_tf_file]Creating tfa_file: %s" % tfa_file
    print "[filter_tf_file]Creating tft_file: %s" % tft_file
    act_file = pnames.tv_filepath(corpus_root, corpus, year, act_file_type, "", "act")
    print "[filter_tf_file]Reading from act_file: %s" % act_file

    s_tfa = codecs.open(tfa_file, "w", encoding='utf-8')
    s_tft = codecs.open(tft_file, "w", encoding='utf-8')
    d_term2cat = defaultdict(str)

    # store the category of each term labeled a and t
    s_act_file = codecs.open(act_file, encoding='utf-8')
    for line in s_act_file:
        line = line.strip("\n")
        l_fields = line.split("\t")
        term = l_fields[0]
        cat = l_fields[3]
        d_term2cat[term] = cat
        #print "term: %s, cat: %s" % (term, cat)
    s_act_file.close()

    # create subset files of .tf for the a and t terms
    s_tf_file = codecs.open(tf_file, encoding='utf-8')
    for line in s_tf_file:
        # don't bother to strip off newline, just grab the term
        term = line.split("\t")[0]
        cat = d_term2cat[term]
        if cat == "a":
            s_tfa.write(line)
        elif cat == "t":
            s_tft.write(line)
    s_tf_file.close()
    s_tfa.close()
    s_tft.close()