def dump_gen_rfields_filtered(es_result_generator,
                              l_fieldnames=[],
                              l_fieldtypes=[],
                              delist_fields_p=True,
                              output_file="rfields.out",
                              result_type="hits"):
    s_output = codecs.open(output_file, "w", encoding='utf-8')
    res_count = 0
    for l_result in es_result_generator:
        rf = rfields(l_result,
                     l_fieldnames,
                     delist_fields_p=delist_fields_p,
                     result_type=result_type)

        # test legality of phrase field (first field in l_fieldnames)
        # only output lines containing legal phrases

        for res in rf:
            if not (canon.illegal_phrase_p(res[0])):
                i = 0
                for field in res:
                    type_string = "%" + l_fieldtypes[i] + "\t"
                    s_output.write(type_string % field)
                    i += 1
                s_output.write("\n")
                res_count += 1
        print "[dump_gen_rfields]%i results written to %s" % (res_count,
                                                              output_file)
    print "[dump_gen_rfields]Completed: %i results written to %s" % (
        res_count, output_file)
    s_output.close()
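
# A minimal, self-contained sketch (not part of the original module) of the
# formatting convention used by dump_gen_rfields_filtered above: each entry in
# l_fieldtypes is a printf-style type character ("s", "i", "f") that gets a "%"
# prefix to build the directive for the corresponding field.  The demo_* name
# is hypothetical and used only for illustration.
def demo_format_row(l_fields, l_fieldtypes):
    # l_fields and l_fieldtypes are parallel lists, e.g.
    # demo_format_row(["silicon wafer", 42], ["s", "i"]) => "silicon wafer\t42\t"
    out = ""
    for field, ftype in zip(l_fields, l_fieldtypes):
        out += ("%" + ftype + "\t") % field
    return out
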
# Example 2
def file2ngram_info(infile, min_len, max_len):
    """ Given a d3_feats file, return a list of tab separated strings of the form:
    <ngram_length> <canonicalized term> <surface term> <doc_id> <pos_signature>
    e.g., 3       epitaxial silicon process       epitaxial silicon processes     000171485800006 JNN
    NOTE: All elements are returned as strings, including the <ngram_length>
    min_len and max_len constrain the length of ngrams to be included in output.
    """

    #print("[file2ngram_info] %s" % infile) ///
    s_infile = gzopen.gzopen(infile)
    # list of lists of info to be returned for each line of input file
    l_term_info = []
    for line in s_infile:
        line = line.strip("\n")
        l_fields = line.split("\t")
        filename = l_fields[0]
        doc_id = path_base_name(filename)
        term = l_fields[2]
        ngram_len = len(term.split(" "))

        # only process the term if its ngram length is within bounds and it passes the noise filter
        if (ngram_len >= min_len) and (ngram_len <= max_len) and not canon.illegal_phrase_p(term):

            canon_np = can.get_canon_np(term)
            # We assume that the last feature on the line is tag_sig!
            pos_sig = l_fields[-1]
            if pos_sig[:7] != "tag_sig":
                print ("[ngram_extract.py]Error: last feature on input line is not labeled tag_sig")
                print ("line: %s" % line)
                sys.exit()
            else:
                # replace pos_sig with a string made of the first char of each pos in the phrase
                # e.g. JJ_NN_NNS => JNN
                pos_sig = "".join(item[0] for item in pos_sig[8:].split("_"))

                prev_Npr = ""
                prev_N = ""
                # grab the prev_Npr feature, if there is one
                try:
                    # extract the value of the prev_Npr feature, if there is one.
                    match = re.search(r'prev_Npr=(\S+)	', line)
                    prev_Npr = match.group(1)
                    # canonicalize the noun
                    prev_N = can.get_canon_np(prev_Npr.split("_")[0])
                except:
                    pass

                l_term_info.append([str(ngram_len), canon_np, term, doc_id, pos_sig, prev_Npr, prev_N])

    s_infile.close()
    return(l_term_info)
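
# A hedged, self-contained sketch (not part of the original module) of the
# tag_sig compression performed in file2ngram_info above: strip the "tag_sig="
# prefix and keep the first character of each POS tag.  The demo_* name is
# hypothetical.
def demo_compress_tag_sig(feature):
    # e.g. "tag_sig=JJ_NN_NNS" => "JNN"
    return "".join(tag[0] for tag in feature[len("tag_sig="):].split("_"))
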
# Example 3
def dir2features_count(filelist_file,
                       out_root,
                       sections,
                       year,
                       overwrite_p,
                       max_doc_terms_count=1000,
                       canonicalize_p=True,
                       filter_noise_p=True):
    #pdb.set_trace()
    out_path = "/".join([out_root, sections])
    out_path_prefix = "/".join([out_path, year])
    # term-feature output file
    tf_file = out_path_prefix + ".tf"
    # remember the mapping between surface head nouns and their canonicalized forms
    canon_file = out_path_prefix + ".canon"

    # create the outpath if it doesn't exist yet
    print("[act_tf.py]creating path: %s,\n[act_tf.py]writing to %s" %
          (out_path, tf_file))

    try:
        # create directory path for corpus, if it does not already exist
        os.makedirs(out_path)
    except OSError:
        print("[act_tf.py]NOTE: Path already exists (or cannot be created).")

    # Do not continue if the .tf file already exists for this corpus and year
    if os.path.isfile(tf_file) and not overwrite_p:
        print "[tf.py]file already exists: %s.  No need to recompute." % tf_file
    else:

        terms_file = out_path_prefix + ".terms"
        feats_file = out_path_prefix + ".feats"
        corpus_size_file = out_path_prefix + ".cs"
        doc_terms_file = out_path_prefix + ".doc_terms"
        # store each filename with a list of its terms
        s_doc_terms_file = codecs.open(doc_terms_file, "w", encoding='utf-8')

        # count of number of docs a term pair cooccurs in
        # dfreq is document freq, cfreq is corpus freq
        #d_pair_freq = defaultdict(int)
        d_pair2dfreq = defaultdict(int)
        # corpus count for the pair
        d_pair2cfreq = defaultdict(int)
        # count of number of docs a term occurs in
        #d_term_freq = defaultdict(int)
        d_term2dfreq = defaultdict(int)
        # count of number of instances of a term
        #d_term_instance_freq = defaultdict(int)
        d_term2cfreq = defaultdict(int)
        # count of number of instances of a feature
        #d_feat_instance_freq = defaultdict(int)
        d_feat2cfreq = defaultdict(int)
        # count of number of docs a feature occurs in
        #d_feat_freq = defaultdict(int)
        d_feat2dfreq = defaultdict(int)

        # doc_count needed for computing probs
        doc_count = 0

        # open list of all the files in the inroot directory
        s_filelist = open(filelist_file)

        #print "inroot: %s, filelist: %s" % (inroot, filelist)

        # iterate through files in filelist
        for infile in s_filelist:
            infile = infile.strip("\n")

            # Create a tab separated string containing the filename and all (legal) canonicalized terms, including
            # duplicates.  This will be used to populate a doc_term retrieval system in
            # elasticsearch.
            # First field will be the filename.
            # At this point, we'll collect the filename and terms into a list.
            # The file without path or extensions should be a unique doc id.
            doc_id = os.path.basename(infile).split(".")[0]
            doc_terms_list = [doc_id]

            # dictionaries to sum up statistics
            # number of times a term appears in the doc
            d_term2count = defaultdict(int)
            d_feat2count = defaultdict(int)
            # number of times a term appears with a specific feature in the doc
            d_pair2count = defaultdict(int)

            # process the dictionaries
            # for each file, create a set of all term-feature pairs in the file
            #/// dictionaries are functionally redundant with sets here.
            # Use sets to capture which terms, features, and pairs occur in the
            # document.  We'll use this after processing each doc to update the
            # doc frequencies of terms, features, and pairs.
            pair_set = set()
            term_set = set()
            feature_set = set()
            #pdb.set_trace()

            s_infile = gzopen.gzopen(infile)
            # count number of lines in file
            i = 0

            # iterate through lines in d3_feats file
            for term_line in s_infile:
                i += 1
                term_line = term_line.strip("\n")
                l_fields = term_line.split("\t")

                term = l_fields[2]

                # Do not process noise (illegal) terms or features.
                #  for cases where feat = "", need to filter!  todo
                # Also skip lines that come from the claims section of patents: these are
                # not very useful and skew term frequency counts.  We detect them via the
                # feature section_loc=CLAIM*.
                # NOTE: At the moment we don't restrict processing to the sections specified
                # by the sections parameter (ta or tas); all remaining lines are included.
                # If we decide to add this functionality, this would be the place to add the filter.
                if (filter_noise_p and canon.illegal_phrase_p(term)) or ("=CLAIM" in term_line):
                    pass

                else:

                    if canonicalize_p:
                        # Do canonicalization of term before incrementing counts
                        #feature = can.get_canon_feature(feature)
                        term = can.get_canon_np(term)

                    # increment the within doc count for the term
                    ##d_term2count[term] += 1
                    term_set.add(term)
                    # increment the global corpus count for the term
                    d_term2cfreq[term] += 1

                    # Add the term to the list of terms for the current doc
                    # Ideally, we would like to ignore parts of a patent (e.g. the claims) and
                    # just use the title, abstract and summary.  However, there is no feature
                    # indicating what section we are in beyond the abstract.  So instead, we
                    # will use a simple doc_terms_count cut off (e.g. 1000). Variable i counts
                    # the number of lines so far.

                    #pdb.set_trace()
                    if (i <= max_doc_terms_count) and (
                            term not in DOC_TERMS_NOISE
                    ) and not canon.illegal_phrase_p(term):
                        doc_terms_list.append(term)

                    # fields 3 and beyond are feature-value pairs
                    # look for features of interest using their prefixes
                    for feature in l_fields[3:]:
                        # Note that we use the prefixes of some feature names for convenience.
                        # The actual features are prev_V, prev_VNP, prev_J, prev_Jpr, prev_Npr, last_word
                        # first_word, if an adjective, may capture some indicators of dimensions (high, low), although
                        # many common adjectives are excluded from the chunk and would be matched by prev_J.
                        # we also pull out the sent and token locations to allow us to locate the full sentence for this
                        # term-feature instance.
                        if (feature[0:6] in [
                                "prev_V", "prev_J", "prev_N", "last_w"
                        ]) and not canon.illegal_feature_p(feature):

                            if canonicalize_p and not "-" in feature:
                                # Do canonicalization of feature before incrementing counts.
                                # NOTE: There is a bug in the canonicalization code when the
                                # term contains hyphens. For example:
                                # >>> can.get_canon_feature("last_word=compass-on-a-chip")
                                # Returns a term with a blank in it: 'last_word=compas-on-a chip'
                                # for this reason, we will not try to canonicalize terms containing
                                # a hyphen.

                                feature = can.get_canon_feature(feature)

                            # increment global corpus count for the feature
                            d_feat2cfreq[feature] += 1

                            feature_set.add(feature)
                            # increment global corpus count for the pair
                            d_pair2cfreq[(term, feature)] += 1
                            # increment the within doc count for the term feature pair
                            ##d_pair2count[(term, feature)] += 1
                            pair_set.add((term, feature))

            # construct a tab-separated string containing file_name and all terms
            doc_terms_str = "\t".join(doc_terms_list)

            s_doc_terms_file.write("%s\n" % doc_terms_str)

            s_infile.close()

            # Using the sets, increment the doc_freq for term-feature pairs in the doc.
            # By making the list a set, we know we are only counting each term-feature combo once
            # per document
            for pair in pair_set:
                d_pair2dfreq[pair] += 1

            # also increment doc_freq for features and terms

            for term in term_set:
                d_term2dfreq[term] += 1

            for feature in feature_set:
                d_feat2dfreq[feature] += 1

            # track total number of docs
            doc_count += 1

        s_filelist.close()

        s_tf_file = codecs.open(tf_file, "w", encoding='utf-8')
        s_terms_file = codecs.open(terms_file, "w", encoding='utf-8')
        s_feats_file = codecs.open(feats_file, "w", encoding='utf-8')
        print "[act_tf.py]Writing to %s" % tf_file

        # compute prob
        print "[act_tf.py]Processed %i files" % doc_count

        for pair in d_pair2dfreq.keys():
            freq_pair = d_pair2dfreq[pair]
            prob_pair = float(freq_pair) / doc_count

            term = pair[0]

            feature = pair[1]
            freq_term = d_term2dfreq[term]
            freq_feat = d_feat2dfreq[feature]

            # Occasionally, we come across a term from a pair which is not actually in
            # the dictionary d_term2dfreq, so the lookup returns a freq of 0.  We need to
            # ignore these cases, since they would create a divide-by-zero error.
            if freq_term > 0 and freq_feat > 0:

                # probability of the feature occurring with the term in a doc, given that
                # the term appears in the doc
                try:
                    prob_fgt = freq_pair / float(freq_term)
                except:
                    pdb.set_trace()

                # added 4/4/15: prob of the feature occurring with the term in a doc, given that
                # the feature appears in the doc
                try:
                    prob_tgf = freq_pair / float(freq_feat)
                except:
                    pdb.set_trace()

                # 4/18/15 adding mutual information based on count of pairs, terms, feats (counted once per doc),
                # and corpus size (# docs)
                # MI = log( prob(pair) / (prob(term) * prob(feature)) )
                #prob_term = float(d_term2dfreq[term])/doc_count
                #prob_feature = float(d_feat2dfreq[feature])/doc_count
                mi_denom = (freq_term) * (freq_feat) / float(doc_count)
                mi = math.log(freq_pair / mi_denom)
                # normalize to -1 to 1
                # Note: if prob_pair == 1, then log is 0 and we risk dividing by 0
                # We'll prevent this by subtracting a small amt from prob_pair
                if prob_pair == 1:
                    prob_pair = prob_pair - .000000001
                npmi = mi / (-math.log(prob_pair))
                s_tf_file.write("%s\t%s\t%i\t%f\t%f\t%f\t%i\t%i\t%f\t%f\n" %
                                (term, feature, freq_pair, prob_pair, prob_fgt,
                                 prob_tgf, freq_term, freq_feat, mi, npmi))

            else:
                # print out a warning about terms with 0 freq.
                print "[act_tf.py]WARNING: term-feature pair: %s has freq = 0. Ignored." % l_pair

        for term in d_term2dfreq.keys():
            term_prob = float(d_term2dfreq[term]) / doc_count
            s_terms_file.write(
                "%s\t%i\t%i\t%f\n" %
                (term, d_term2dfreq[term], d_term2cfreq[term], term_prob))

        for feat in d_feat2dfreq.keys():
            feat_prob = float(d_feat2dfreq[feat]) / doc_count
            s_feats_file.write(
                "%s\t%i\t%i\t%f\n" %
                (feat, d_feat2dfreq[feat], d_feat2cfreq[feat], feat_prob))

        s_canon_file = codecs.open(canon_file, "w", encoding='utf-8')
        for key, value in can.d_n2canon.items():
            # Only write out a line if the canonical form differs from the surface form
            if key != value:
                s_canon_file.write("%s\t%s\n" % (key, value))
        s_canon_file.close()

        s_tf_file.close()
        s_terms_file.close()
        s_feats_file.close()

        s_doc_terms_file.close()

        # Finally, create a file to store the corpus size (# docs processed)
        s_corpus_size_file = open(corpus_size_file, "w")
        s_corpus_size_file.write("%i\n" % doc_count)
        s_corpus_size_file.close()
        print "[act_tf.py dir2features_count]Storing corpus size in %s" % corpus_size_file
# Example 4
def dir2features_count(inroot,
                       outroot,
                       year,
                       canonicalize_p=True,
                       filter_noise_p=True):
    outfilename = str(year)
    # term-feature output file
    outfile = outroot + outfilename + ".tf"

    # Do not continue if the .tf file already exists for this corpus and year
    if os.path.isfile(outfile):
        print "[tf.py]file already exists: %s.  No need to recompute." % outfile
    else:

        terms_file = outroot + outfilename + ".terms"
        feats_file = outroot + outfilename + ".feats"
        corpus_size_file = outroot + outfilename + ".cs"

        # count of number of docs a term pair cooccurs in
        d_pair_freq = collections.defaultdict(int)
        # count of number of docs a term occurs in
        d_term_freq = collections.defaultdict(int)
        # count of number of instances of a term
        d_term_instance_freq = collections.defaultdict(int)
        # count of number of instances of a feature
        d_feat_instance_freq = collections.defaultdict(int)
        # count of number of docs a feature occurs in
        d_feat_freq = collections.defaultdict(int)

        # Be safe, check if outroot path exists, and create it if not
        if not os.path.exists(outroot):
            os.makedirs(outroot)
            print "Created outroot dir: %s" % outroot

        # doc_count needed for computing probs
        doc_count = 0

        # make a list of all the files in the inroot directory
        filelist = glob.glob(inroot + "/*")

        #print "inroot: %s, filelist: %s" % (inroot, filelist)

        for infile in filelist:

            # process the term files
            # for each file, create a set of all term-feature pairs in the file
            pair_set = set()
            term_set = set()
            feature_set = set()
            #pdb.set_trace()
            s_infile = codecs.open(infile, encoding='utf-8')
            i = 0
            for term_line in s_infile:
                i += 1

                term_line = term_line.strip("\n")
                l_fields = term_line.split("\t")
                term = l_fields[0]
                feature = l_fields[1]
                term_feature_within_doc_count = int(l_fields[2])
                #print "term: %s, feature: %s" % (term, feature)
                """
                # filter out non alphabetic phrases, noise terms
                if alpha_phrase_p(term):
                    pair = term + "\t" + feature
                    print "term matches: %s, pair is: %s" % (term, pair)
                    pair_set.add(pair)
                """

                # if the feature field is "", then we use this line to count term
                # instances
                if feature == "":

                    if (filter_noise_p and canon.illegal_phrase_p(term)):
                        pass
                    else:
                        if canonicalize_p:
                            # Do canonicalization of term before incrementing counts
                            # note we don't canonicalize feature here since feature == ""
                            term = can.get_canon_np(term)

                        d_term_instance_freq[term] += term_feature_within_doc_count
                        # add term to set for this document to accumulate term-doc count
                        term_set.add(term)
                        # note:  In ln-us-cs-500k 1997.tf, it appears that one term (e.g. u'y \u2033')
                        # does not get added to the set.  Perhaps the special char is treated as the same
                        # as another term and is therefore excluded from the set add.  As a result,
                        # the set of terms in d_term_freq may be missing some odd terms that occur in .tf.
                        # Later we will use terms from .tf as keys into d_term_freq, so we have to allow for
                        # an occasional missing key at that point (in nbayes.py).
                else:
                    # the line is a term_feature pair
                    # (filter_noise_p should be False to handle chinese)

                    # Do not process noise (illegal) terms or features
                    #///  for cases where feat = "", need to filter!  todo
                    #pdb.set_trace()
                    if (filter_noise_p and canon.illegal_phrase_p(term)
                        ) or canon.illegal_feature_p(feature):
                        pass

                    else:

                        if canonicalize_p:
                            # Do canonicalization of term and feature before incrementing counts
                            feature = can.get_canon_feature(feature)
                            term = can.get_canon_np(term)

                        #pdb.set_trace()
                        pair = term + "\t" + feature
                        ##print "term matches: %s, pair is: %s" % (term, pair)
                        pair_set.add(pair)
                        feature_set.add(feature)
                        d_feat_instance_freq[feature] += term_feature_within_doc_count

                        #print "pair: %s, term: %s, feature: %s" % (pair, term, feature)
                        #pdb.set_trace()

            s_infile.close()

            # increment the doc_freq for term-feature pairs in the doc
            # By making the list a set, we know we are only counting each term-feature combo once
            # per document
            for pair in pair_set:
                d_pair_freq[pair] += 1

            # also increment doc_freq for features and terms

            for term in term_set:
                d_term_freq[term] += 1

            for feature in feature_set:
                d_feat_freq[feature] += 1

            # track total number of docs
            doc_count += 1

        s_outfile = codecs.open(outfile, "w", encoding='utf-8')
        s_terms_file = codecs.open(terms_file, "w", encoding='utf-8')
        s_feats_file = codecs.open(feats_file, "w", encoding='utf-8')
        print "Writing to %s" % outfile

        # compute prob
        print "Processed %i files" % doc_count

        for pair in d_pair_freq.keys():
            freq_pair = d_pair_freq[pair]
            prob_pair = float(freq_pair) / doc_count
            l_pair = pair.split("\t")
            term = l_pair[0]
            #print "term after split: %s, pair is: %s" % (term, pair)
            feature = l_pair[1]
            freq_term = d_term_freq[term]
            freq_feat = d_feat_freq[feature]

            # probability of the feature occurring with the term in a doc, given that
            # the term appears in the doc
            try:
                prob_fgt = freq_pair / float(freq_term)
            except:
                pdb.set_trace()

            # added 4/4/15: prob of the feature occurring with the term in a doc, given that
            # the feature appears in the doc
            try:
                prob_tgf = freq_pair / float(freq_feat)
            except:
                pdb.set_trace()

            # 4/18/15 adding mutual information based on count of pairs, terms, feats (counted once per doc),
            # and corpus size (# docs)
            # MI = log( prob(pair) / (prob(term) * prob(feature)) )
            #prob_term = float(d_term_freq[term])/doc_count
            #prob_feature = float(d_feat_freq[feature])/doc_count
            mi_denom = (freq_term) * (freq_feat) / float(doc_count)
            mi = math.log(freq_pair / mi_denom)
            # normalize to -1 to 1
            # Note: if prob_pair == 1, then log is 0 and we would divide by 0;
            # prevent this by subtracting a small amount from prob_pair.
            if prob_pair == 1:
                prob_pair = prob_pair - .000000001
            npmi = mi / (-math.log(prob_pair))
            s_outfile.write("%s\t%s\t%i\t%f\t%f\t%f\t%i\t%i\t%f\t%f\n" %
                            (term, feature, freq_pair, prob_pair, prob_fgt,
                             prob_tgf, freq_term, freq_feat, mi, npmi))

        # /// TODO: this table makes tf.f file redundant!  Replace use of tf.f
        for term in d_term_freq.keys():
            term_prob = float(d_term_freq[term]) / doc_count
            s_terms_file.write("%s\t%i\t%i\t%f\n" %
                               (term, d_term_freq[term],
                                d_term_instance_freq[term], term_prob))

        for feat in d_feat_freq.keys():
            feat_prob = float(d_feat_freq[feat]) / doc_count
            s_feats_file.write("%s\t%i\t%i\t%f\n" %
                               (feat, d_feat_freq[feat],
                                d_feat_instance_freq[feat], feat_prob))

        s_outfile.close()
        s_terms_file.close()
        s_feats_file.close()

        # Finally, create a file to store the corpus size (# docs in the source directory)
        cmd = "ls -1 " + inroot + " | wc -l > " + corpus_size_file
        print "[dir2features_count]Storing corpus size in %s " % corpus_size_file
        os.system(cmd)
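
# A hedged sketch (not part of the original module): the column layout of each
# line written to the .tf files by the two dir2features_count variants above,
# expressed as a small parser.  Field names follow the write order and are
# illustrative; the demo_* name is hypothetical.
def demo_parse_tf_line(line):
    (term, feature, freq_pair, prob_pair, prob_fgt, prob_tgf,
     freq_term, freq_feat, mi, npmi) = line.rstrip("\n").split("\t")
    return {"term": term,                    # term (possibly canonicalized)
            "feature": feature,              # feature (a prev_* or last_word feature)
            "freq_pair": int(freq_pair),     # docs containing the term-feature pair
            "prob_pair": float(prob_pair),   # freq_pair / doc_count
            "prob_fgt": float(prob_fgt),     # P(feature | term)
            "prob_tgf": float(prob_tgf),     # P(term | feature)
            "freq_term": int(freq_term),     # docs containing the term
            "freq_feat": int(freq_feat),     # docs containing the feature
            "mi": float(mi),                 # pointwise mutual information
            "npmi": float(npmi)}             # normalized PMI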