def dump_gen_rfields_filtered(es_result_generator, l_fieldnames=[], l_fieldtypes=[],
                              delist_fields_p=True, output_file="rfields.out",
                              result_type="hits"):
    s_output = codecs.open(output_file, "w", encoding='utf-8')
    res_count = 0
    for l_result in es_result_generator:
        rf = rfields(l_result, l_fieldnames, delist_fields_p=delist_fields_p,
                     result_type=result_type)
        # Test the legality of the phrase field (the first field in l_fieldnames) and
        # only output lines containing legal phrases.
        for res in rf:
            if not canon.illegal_phrase_p(res[0]):
                for i, field in enumerate(res):
                    # build a printf-style format string from the field's type code, e.g. "%s\t"
                    type_string = "%" + l_fieldtypes[i] + "\t"
                    s_output.write(type_string % field)
                s_output.write("\n")
                res_count += 1
    print "[dump_gen_rfields_filtered]Completed: %i results written to %s" % (res_count, output_file)
    s_output.close()
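# A minimal usage sketch, not part of the original module.  l_fieldtypes holds
# printf-style type codes, one per field in l_fieldnames, so ["s", "s", "i"]
# yields the per-field format strings "%s\t", "%s\t", "%i\t".  The field names
# and output file below are hypothetical; es_result_generator is assumed to be
# any generator over elasticsearch result batches accepted by rfields().
def _example_dump_rfields(es_result_generator):
    dump_gen_rfields_filtered(es_result_generator,
                              l_fieldnames=["phrase", "doc_id", "year"],
                              l_fieldtypes=["s", "s", "i"],
                              output_file="rfields_example.out")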
def file2ngram_info(infile, min_len, max_len):
    """
    Given a d3_feats file, return a list of lists of strings of the form:
    [<ngram_length>, <canonicalized term>, <surface term>, <doc_id>, <pos_signature>,
     <prev_Npr>, <canonicalized prev_N>]
    e.g., ['3', 'epitaxial silicon process', 'epitaxial silicon processes', '000171485800006', 'JNN', ...]
    NOTE: All elements are returned as strings, including the <ngram_length>.
    min_len and max_len constrain the length of the ngrams included in the output.
    """
    s_infile = gzopen.gzopen(infile)
    # list of lists of info to be returned, one for each qualifying line of the input file
    l_term_info = []
    for line in s_infile:
        line = line.strip("\n")
        l_fields = line.split("\t")
        filename = l_fields[0]
        doc_id = path_base_name(filename)
        term = l_fields[2]
        ngram_len = len(term.split(" "))
        # continue only if the conditions on the term are met (ngram length and noise filter)
        if (ngram_len >= min_len) and (ngram_len <= max_len) and not canon.illegal_phrase_p(term):
            canon_np = can.get_canon_np(term)
            # We assume that the last feature on the line is tag_sig!
            pos_sig = l_fields[-1]
            if pos_sig[:7] != "tag_sig":
                print("[ngram_extract.py]Error: last feature on input line is not labeled tag_sig")
                print("line: %s" % line)
                sys.exit()
            else:
                # replace pos_sig with a string made of the first char of each pos in the phrase,
                # e.g. JJ_NN_NNS => JNN
                pos_sig = "".join(item[0] for item in pos_sig[8:].split("_"))
            prev_Npr = ""
            prev_N = ""
            try:
                # extract the value of the prev_Npr feature, if there is one
                match = re.search(r'prev_Npr=(\S+) ', line)
                prev_Npr = match.group(1)
                # canonicalize the noun
                prev_N = can.get_canon_np(prev_Npr.split("_")[0])
            except:
                pass
            l_term_info.append([str(ngram_len), canon_np, term, doc_id, pos_sig, prev_Npr, prev_N])
    s_infile.close()
    return l_term_info
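# A minimal usage sketch, not part of the original module.  The input path is
# hypothetical and should point at a d3_feats file (the gzopen wrapper suggests
# gzipped input is also accepted).  Here we keep only 2- and 3-word ngrams and
# print them tab-separated.
def _example_file2ngram_info():
    for info in file2ngram_info("data/000171485800006.d3_feats.gz", 2, 3):
        # info is [ngram_len, canon_term, surface_term, doc_id, pos_sig, prev_Npr, prev_N]
        print("\t".join(info))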
def dir2features_count(filelist_file, out_root, sections, year, overwrite_p,
                       max_doc_terms_count=1000, canonicalize_p=True, filter_noise_p=True):
    out_path = "/".join([out_root, sections])
    out_path_prefix = "/".join([out_path, year])
    # term-feature output file
    tf_file = out_path_prefix + ".tf"
    # remember the mapping between surface head nouns and their canonicalized forms
    canon_file = out_path_prefix + ".canon"
    # create the out_path if it doesn't exist yet
    print("[act_tf.py]creating path: %s,\n[act_tf.py]writing to %s" % (out_path, tf_file))
    try:
        # create the directory path for the corpus, if it does not already exist
        os.makedirs(out_path)
    except:
        print("[act_tf.py]NOTE: Path already exists (or cannot be created).")
    # Do not continue if the .tf file already exists for this corpus and year
    if os.path.isfile(tf_file) and not overwrite_p:
        print "[act_tf.py]file already exists: %s. No need to recompute." % tf_file
    else:
        terms_file = out_path_prefix + ".terms"
        feats_file = out_path_prefix + ".feats"
        corpus_size_file = out_path_prefix + ".cs"
        doc_terms_file = out_path_prefix + ".doc_terms"
        # store each filename with a list of its terms
        s_doc_terms_file = codecs.open(doc_terms_file, "w", encoding='utf-8')
        # dfreq is document frequency, cfreq is corpus frequency
        # number of docs in which a term-feature pair cooccurs
        d_pair2dfreq = defaultdict(int)
        # corpus count for the pair
        d_pair2cfreq = defaultdict(int)
        # number of docs in which a term occurs
        d_term2dfreq = defaultdict(int)
        # number of instances of a term
        d_term2cfreq = defaultdict(int)
        # number of instances of a feature
        d_feat2cfreq = defaultdict(int)
        # number of docs in which a feature occurs
        d_feat2dfreq = defaultdict(int)
        # doc_count needed for computing probs
        doc_count = 0
        # open the list of all the files in the corpus
        s_filelist = open(filelist_file)
        # iterate through the files in the filelist
        for infile in s_filelist:
            infile = infile.strip("\n")
            # Create a tab-separated string containing the filename and all (legal)
            # canonicalized terms, including duplicates.  This will be used to populate
            # a doc_term retrieval system in elasticsearch.  The first field is the
            # filename.  At this point, we collect the filename and terms into a list.
            # The file name without path or extensions should be a unique doc id.
            doc_id = os.path.basename(infile).split(".")[0]
            doc_terms_list = [doc_id]
            # Per-doc dictionaries to sum up within-doc statistics.
            #/// NOTE: these dictionaries are functionally redundant with the sets below
            # and their increments are currently commented out.
            # number of times a term appears in the doc
            d_term2count = defaultdict(int)
            d_feat2count = defaultdict(int)
            # number of times a term appears with a specific feature in the doc
            d_pair2count = defaultdict(int)
            # For each file, use sets to capture which terms, features, and pairs occur
            # in the document.  After processing each doc we use them to update the doc
            # frequencies of terms, features, and pairs.
            pair_set = set()
            term_set = set()
            feature_set = set()
            s_infile = gzopen.gzopen(infile)
            # count the number of lines in the file
            i = 0
            # iterate through the lines in the d3_feats file
            for term_line in s_infile:
                i += 1
                term_line = term_line.strip("\n")
                l_fields = term_line.split("\t")
                term = l_fields[2]
                # Do not process noise (illegal) terms or features.
                # For cases where feat = "", need to filter! todo
                if filter_noise_p and canon.illegal_phrase_p(term):
                    pass
                # Eliminate lines that come from the claims section of patents.
                # These are not very useful and skew term frequency counts.
                # We do this by eliminating lines containing the feature section_loc=CLAIM*.
                elif "=CLAIM" in term_line:
                    pass
                # NOTE: At the moment we don't test which sections of the doc should be
                # included, as specified by the sections parameter (ta or tas); we include
                # every line.  If we add this functionality, this is the place to add the filter.
                else:
                    if canonicalize_p:
                        # canonicalize the term before incrementing counts
                        term = can.get_canon_np(term)
                    # record the term for this doc
                    ##d_term2count[term] += 1
                    term_set.add(term)
                    # increment the global corpus count for the term
                    d_term2cfreq[term] += 1
                    # Add the term to the list of terms for the current doc.
                    # Ideally, we would like to ignore parts of a patent (e.g. the claims) and
                    # just use the title, abstract and summary.  However, there is no feature
                    # indicating what section we are in beyond the abstract, so instead we use
                    # a simple doc_terms_count cutoff (e.g. 1000).  Variable i counts the
                    # number of lines so far.
                    if (i <= max_doc_terms_count) and (term not in DOC_TERMS_NOISE) \
                       and not canon.illegal_phrase_p(term):
                        doc_terms_list.append(term)
                    # fields 3 and beyond are feature-value pairs;
                    # look for features of interest using their prefixes
                    for feature in l_fields[3:]:
                        # Note that we use the prefixes of some feature names for convenience.
                        # The actual features are prev_V, prev_VNP, prev_J, prev_Jpr, prev_Npr,
                        # and last_word.  first_word, if an adjective, may capture some indicators
                        # of dimensions (high, low), although many common adjectives are excluded
                        # from the chunk and would be matched by prev_J.  We also pull out the sent
                        # and token locations to allow us to locate the full sentence for this
                        # term-feature instance.
                        if (feature[0:6] in ["prev_V", "prev_J", "prev_N", "last_w"]) \
                           and not canon.illegal_feature_p(feature):
                            if canonicalize_p and not "-" in feature:
                                # Canonicalize the feature before incrementing counts.
                                # NOTE: There is a bug in the canonicalization code when the
                                # term contains hyphens.  For example:
                                # >>> can.get_canon_feature("last_word=compass-on-a-chip")
                                # returns a term with a blank in it: 'last_word=compas-on-a chip'.
                                # For this reason, we do not canonicalize terms containing a hyphen.
                                feature = can.get_canon_feature(feature)
                            # increment the global corpus counts for the feature and the pair
                            d_feat2cfreq[feature] += 1
                            feature_set.add(feature)
                            d_pair2cfreq[(term, feature)] += 1
                            # increment the within-doc count for the term-feature pair
                            ##d_pair2count[(term, feature)] += 1
                            pair_set.add((term, feature))
            # construct a tab-separated string containing the file name and all its terms
            doc_terms_str = "\t".join(doc_terms_list)
            s_doc_terms_file.write("%s\n" % doc_terms_str)
            s_infile.close()
            # Using the sets, increment the doc frequencies for the term-feature pairs in
            # the doc.  Because these are sets, each term-feature combination is counted
            # only once per document.
            for pair in pair_set:
                d_pair2dfreq[pair] += 1
            # also increment the doc frequencies for terms and features
            for term in term_set:
                d_term2dfreq[term] += 1
            for feature in feature_set:
                d_feat2dfreq[feature] += 1
            # track the total number of docs
            doc_count += 1
        s_filelist.close()

        s_tf_file = codecs.open(tf_file, "w", encoding='utf-8')
        s_terms_file = codecs.open(terms_file, "w", encoding='utf-8')
        s_feats_file = codecs.open(feats_file, "w", encoding='utf-8')
        print "[act_tf.py]Writing to %s" % tf_file
        # compute probabilities
        print "[act_tf.py]Processed %i files" % doc_count
        for pair in d_pair2dfreq.keys():
            freq_pair = d_pair2dfreq[pair]
            prob_pair = float(freq_pair) / doc_count
            term = pair[0]
            feature = pair[1]
            freq_term = d_term2dfreq[term]
            freq_feat = d_feat2dfreq[feature]
            # Occasionally we come across a term in a pair which is not actually in the
            # dictionary d_term2dfreq, giving a frequency of 0.  We ignore these cases,
            # since they would cause a divide-by-zero error.
            if freq_term > 0 and freq_feat > 0:
                # probability of the feature occurring with the term in a doc, given that
                # the term appears in the doc
                try:
                    prob_fgt = freq_pair / float(freq_term)
                except:
                    pdb.set_trace()
                # added 4/4/15: prob of the feature occurring with the term in a doc, given
                # that the feature appears in the doc
                try:
                    prob_tgf = freq_pair / float(freq_feat)
                except:
                    pdb.set_trace()
                # 4/18/15: added mutual information based on the counts of pairs, terms, and
                # feats (counted once per doc) and the corpus size (# docs):
                # MI = log(prob(pair) / (prob(term) * prob(feature)))
                mi_denom = freq_term * freq_feat / float(doc_count)
                mi = math.log(freq_pair / mi_denom)
                # Normalize to the range -1 to 1.
                # Note: if prob_pair == 1, then log(prob_pair) is 0 and we risk dividing by
                # zero, so we subtract a small amount from prob_pair in that case.
                if prob_pair == 1:
                    prob_pair = prob_pair - .000000001
                npmi = mi / (-math.log(prob_pair))
                s_tf_file.write("%s\t%s\t%i\t%f\t%f\t%f\t%i\t%i\t%f\t%f\n"
                                % (term, feature, freq_pair, prob_pair, prob_fgt, prob_tgf,
                                   freq_term, freq_feat, mi, npmi))
            else:
                # print a warning about pairs whose term or feature frequency is 0
                print "[act_tf.py]WARNING: term-feature pair %s has term or feature freq = 0. Ignored." % (pair,)
        for term in d_term2dfreq.keys():
            term_prob = float(d_term2dfreq[term]) / doc_count
            s_terms_file.write("%s\t%i\t%i\t%f\n"
                               % (term, d_term2dfreq[term], d_term2cfreq[term], term_prob))
        for feat in d_feat2dfreq.keys():
            feat_prob = float(d_feat2dfreq[feat]) / doc_count
            s_feats_file.write("%s\t%i\t%i\t%f\n"
                               % (feat, d_feat2dfreq[feat], d_feat2cfreq[feat], feat_prob))
        s_canon_file = codecs.open(canon_file, "w", encoding='utf-8')
        for key, value in can.d_n2canon.items():
            # only write out a line if the canonical form differs from the surface form
            if key != value:
                s_canon_file.write("%s\t%s\n" % (key, value))
        s_canon_file.close()
        s_tf_file.close()
        s_terms_file.close()
        s_feats_file.close()
        s_doc_terms_file.close()
        # Finally, create a file to store the corpus size (the number of docs processed)
        s_corpus_size_file = open(corpus_size_file, "w")
        s_corpus_size_file.write("%i\n" % doc_count)
        s_corpus_size_file.close()
        print "[act_tf.py dir2features_count]Storing corpus size in %s " % corpus_size_file
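# A minimal usage sketch, not part of the original module; the filelist path and
# output root are hypothetical.  With sections="tas" and year="1997" this would
# write 1997.tf, 1997.terms, 1997.feats, 1997.doc_terms, 1997.canon and 1997.cs
# under <out_root>/tas/.  The .tf columns are: term, feature, doc freq of the
# pair, prob(pair), prob(feature|term), prob(term|feature), doc freq of the term,
# doc freq of the feature, mi, npmi.
def _example_act_tf_run():
    dir2features_count("corpora/cs/files_1997.txt",   # hypothetical filelist of d3_feats files
                       "corpora/cs/tf",                # hypothetical output root
                       "tas", "1997",
                       overwrite_p=False,
                       max_doc_terms_count=1000)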
def dir2features_count(inroot, outroot, year, canonicalize_p=True, filter_noise_p=True):
    outfilename = str(year)
    # term-feature output file
    outfile = outroot + outfilename + ".tf"
    # Do not continue if the .tf file already exists for this corpus and year
    if os.path.isfile(outfile):
        print "[tf.py]file already exists: %s. No need to recompute." % outfile
    else:
        terms_file = outroot + outfilename + ".terms"
        feats_file = outroot + outfilename + ".feats"
        corpus_size_file = outroot + outfilename + ".cs"
        # number of docs in which a term-feature pair cooccurs
        d_pair_freq = collections.defaultdict(int)
        # number of docs in which a term occurs
        d_term_freq = collections.defaultdict(int)
        # number of instances of a term
        d_term_instance_freq = collections.defaultdict(int)
        # number of instances of a feature
        d_feat_instance_freq = collections.defaultdict(int)
        # number of docs in which a feature occurs
        d_feat_freq = collections.defaultdict(int)
        # To be safe, check whether the outroot path exists and create it if not
        if not os.path.exists(outroot):
            os.makedirs(outroot)
            print "Created outroot dir: %s" % outroot
        # doc_count needed for computing probs
        doc_count = 0
        # make a list of all the files in the inroot directory
        filelist = glob.glob(inroot + "/*")
        for infile in filelist:
            # Process the term files.  For each file, create sets of all terms,
            # features, and term-feature pairs in the file.
            pair_set = set()
            term_set = set()
            feature_set = set()
            s_infile = codecs.open(infile, encoding='utf-8')
            i = 0
            for term_line in s_infile:
                i += 1
                term_line = term_line.strip("\n")
                l_fields = term_line.split("\t")
                term = l_fields[0]
                feature = l_fields[1]
                term_feature_within_doc_count = int(l_fields[2])
                """
                # filter out non-alphabetic phrases, noise terms
                if alpha_phrase_p(term):
                    pair = term + "\t" + feature
                    print "term matches: %s, pair is: %s" % (term, pair)
                    pair_set.add(pair)
                """
                # if the feature field is "", then we use this line to count term instances
                if feature == "":
                    if filter_noise_p and canon.illegal_phrase_p(term):
                        pass
                    else:
                        if canonicalize_p:
                            # Canonicalize the term before incrementing counts.
                            # Note we don't canonicalize the feature here since feature == "".
                            term = can.get_canon_np(term)
                        d_term_instance_freq[term] += term_feature_within_doc_count
                        # add the term to the set for this document to accumulate the term-doc count
                        term_set.add(term)
                        # NOTE: In ln-us-cs-500k 1997.tf, it appears that one term (e.g. u'y \u2033')
                        # does not get added to the set.  Perhaps the special char is treated as the
                        # same as another term and is therefore excluded from the set add.  As a
                        # result, the set of terms in d_term_freq may be missing some odd terms that
                        # occur in .tf.  Later we will use terms from .tf as keys into d_term_freq,
                        # so we have to allow for an occasional missing key at that point (in nbayes.py).
                else:
                    # The line is a term-feature pair.
                    # (filter_noise_p should be False to handle Chinese.)
                    # Do not process noise (illegal) terms or features.
                    #/// for cases where feat = "", need to filter! todo
                    if (filter_noise_p and canon.illegal_phrase_p(term)) or canon.illegal_feature_p(feature):
                        pass
                    else:
                        if canonicalize_p:
                            # canonicalize the term and feature before incrementing counts
                            feature = can.get_canon_feature(feature)
                            term = can.get_canon_np(term)
                        pair = term + "\t" + feature
                        pair_set.add(pair)
                        feature_set.add(feature)
                        d_feat_instance_freq[feature] += term_feature_within_doc_count
            s_infile.close()
            # Increment the doc frequencies for the term-feature pairs in the doc.
            # Because pair_set is a set, each term-feature combination is counted only
            # once per document.
            for pair in pair_set:
                d_pair_freq[pair] += 1
            # also increment the doc frequencies for terms and features
            for term in term_set:
                d_term_freq[term] += 1
            for feature in feature_set:
                d_feat_freq[feature] += 1
            # track the total number of docs
            doc_count += 1

        s_outfile = codecs.open(outfile, "w", encoding='utf-8')
        s_terms_file = codecs.open(terms_file, "w", encoding='utf-8')
        s_feats_file = codecs.open(feats_file, "w", encoding='utf-8')
        print "Writing to %s" % outfile
        # compute probabilities
        print "Processed %i files" % doc_count
        for pair in d_pair_freq.keys():
            freq_pair = d_pair_freq[pair]
            prob_pair = float(freq_pair) / doc_count
            l_pair = pair.split("\t")
            term = l_pair[0]
            feature = l_pair[1]
            freq_term = d_term_freq[term]
            freq_feat = d_feat_freq[feature]
            # probability of the feature occurring with the term in a doc, given that
            # the term appears in the doc
            try:
                prob_fgt = freq_pair / float(freq_term)
            except:
                pdb.set_trace()
            # added 4/4/15: prob of the feature occurring with the term in a doc, given
            # that the feature appears in the doc
            try:
                prob_tgf = freq_pair / float(freq_feat)
            except:
                pdb.set_trace()
            # 4/18/15: added mutual information based on the counts of pairs, terms, and
            # feats (counted once per doc) and the corpus size (# docs):
            # MI = log(prob(pair) / (prob(term) * prob(feature)))
            mi_denom = freq_term * freq_feat / float(doc_count)
            mi = math.log(freq_pair / mi_denom)
            # Normalize to the range -1 to 1.
            # (Guard against prob_pair == 1, where -log(prob_pair) == 0, as in act_tf.py above.)
            if prob_pair == 1:
                prob_pair = prob_pair - .000000001
            npmi = mi / (-math.log(prob_pair))
            s_outfile.write("%s\t%s\t%i\t%f\t%f\t%f\t%i\t%i\t%f\t%f\n"
                            % (term, feature, freq_pair, prob_pair, prob_fgt, prob_tgf,
                               freq_term, freq_feat, mi, npmi))
        # /// TODO: this table makes the tf.f file redundant!  Replace uses of tf.f.
        for term in d_term_freq.keys():
            term_prob = float(d_term_freq[term]) / doc_count
            s_terms_file.write("%s\t%i\t%i\t%f\n"
                               % (term, d_term_freq[term], d_term_instance_freq[term], term_prob))
        for feat in d_feat_freq.keys():
            feat_prob = float(d_feat_freq[feat]) / doc_count
            s_feats_file.write("%s\t%i\t%i\t%f\n"
                               % (feat, d_feat_freq[feat], d_feat_instance_freq[feat], feat_prob))
        s_outfile.close()
        s_terms_file.close()
        s_feats_file.close()
        # Finally, create a file that stores the corpus size (# docs in the source directory)
        cmd = "ls -1 " + inroot + " | wc -l > " + corpus_size_file
        print "[dir2features_count]Storing corpus size in %s " % corpus_size_file
        os.system(cmd)
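# A worked sketch, not part of the original module, of the MI/NPMI computation used
# in both dir2features_count variants above (it reuses the module's math import).
# With illustrative counts doc_count=1000, freq_term=50, freq_feat=200, freq_pair=40:
#   mi   = log((40/1000) / ((50/1000) * (200/1000))) = log(4.0) ~= 1.386
#   npmi = mi / -log(40/1000)                                   ~= 0.431
def _example_npmi(freq_pair=40, freq_term=50, freq_feat=200, doc_count=1000):
    prob_pair = float(freq_pair) / doc_count
    # denominator of the MI ratio, i.e. prob(term) * prob(feature) * doc_count
    mi_denom = freq_term * freq_feat / float(doc_count)
    mi = math.log(freq_pair / mi_denom)
    npmi = mi / (-math.log(prob_pair))
    return mi, npmi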