def encode_review(review_id, base_dir="/home/byron/abstrackr-web/abstrackr/lib/curious_snake/data"):
    '''
    Write the abstracts for the given review to disk, then tf-idf encode
    each text field in its own feature space (multi-view encoding).

    Parameters
    ----------
    review_id : id of the review whose citations should be encoded.
    base_dir : root data directory; the review's files live in
        base_dir/<review_id>.

    Returns the per-review directory path that was encoded.
    '''
    fields = ["title", "abstract", "keywords"]
    base_dir = os.path.join(base_dir, str(review_id))

    # write the abstracts to disk
    to_disk(base_dir, review_ids=[review_id], fields=fields)

    # Load the id -> label mapping. Use a context manager (and binary mode,
    # which pickle expects) so the file handle is closed promptly -- the
    # previous version leaked the open handle.
    with open(os.path.join(base_dir, "labels.pickle"), "rb") as lbl_f:
        lbl_d = pickle.load(lbl_f)

    # we encode the three main fields in separate spaces (multi-view)
    for field in fields:
        print("\n\n\n(write_review_to_disk) on field: %s..." % field)
        dir_path = os.path.join(base_dir, field)
        out_path = os.path.join(dir_path, "encoded")
        out_f_name = "%s_encoded" % field
        tfidf2.encode_docs(dir_path, out_path, out_f_name,
                           lbl_dict=lbl_d, clean_first=True, binary=True,
                           min_word_count=3, bi_grams_too=True)

    # fetch the encoded status entry for this review,
    # ***which we assume exists!***
    return base_dir
def fetch_and_encode(article_ids, out_dir, binary_features=False, labels=None, fields=None, out_f_name=""):
    '''
    First fetches the articles from the web, then cleans and tf-idf encodes
    each requested field into out_dir/<field>/encoded.

    Parameters
    ----------
    article_ids : ids of the articles to fetch.
    out_dir : directory the fetched and encoded documents are written under.
    binary_features : accepted for interface compatibility; currently unused
        by this routine.
    labels : optional dictionary mapping ids to labels, forwarded to the
        encoder as lbl_dict.
    fields : fields to fetch/encode; defaults to ["AB", "TI"]
        (abstract and title).
    out_f_name : prefix for the encoded output file names.

    NOTE(review): this function is redefined later in the file, so this
    version is shadowed at import time -- confirm which definition is wanted.
    '''
    # Default handled here rather than as a mutable default argument,
    # which would be shared across calls.
    if fields is None:
        fields = ["AB", "TI"]

    # first, fetch the articles
    fetch_and_write_out(article_ids, out_dir, fields=fields)

    for field in fields:
        print("encoding %s..." % field)
        # now, clean and encode them
        out_for_field = os.path.join(out_dir, field)
        tfidf2.encode_docs(out_for_field, os.path.join(out_for_field, "encoded"),
                           out_f_name + field, lbl_dict=labels)
    print("finito.")
def fetch_and_encode(article_ids, out_dir, binary_features=False, lbl_dict=None, fields=None, out_f_name=""):
    '''
    First fetches the articles from the web, then cleans and tf-idf encodes
    each requested field into out_dir/<field>/encoded.

    Parameters
    ----------
    article_ids : ids of the articles to fetch.
    out_dir : directory the fetched and encoded documents are written under.
    binary_features : accepted for interface compatibility; currently unused
        by this routine.
    lbl_dict : assumed to be a dictionary mapping ids to labels. If it is
        not provided, documents will be encoded with a "?" as their label.
    fields : fields to fetch/encode; defaults to ["AB", "TI"]
        (abstract and title).
    out_f_name : prefix for the encoded output file names.

    NOTE(review): a later definition of this function in the file shadows
    this one at import time -- confirm which definition is wanted.
    '''
    # Default handled here rather than as a mutable default argument,
    # which would be shared across calls.
    if fields is None:
        fields = ["AB", "TI"]

    # first, fetch the articles
    fetch_and_write_out(article_ids, out_dir, fields=fields)

    for field in fields:
        print("encoding %s..." % field)
        # now, clean and encode them
        out_for_field = os.path.join(out_dir, field)
        tfidf2.encode_docs(out_for_field, os.path.join(out_for_field, "encoded"),
                           out_f_name + field, lbl_dict=lbl_dict)
    print("finito.")
def fetch_and_encode(article_ids, out_dir, binary_features=False, labels=None, fields=None, out_f_name=""):
    '''
    First fetches the articles from the web, then cleans and tf-idf encodes
    each requested field into out_dir/<field>/encoded.

    Parameters
    ----------
    article_ids : ids of the articles to fetch.
    out_dir : directory the fetched and encoded documents are written under.
    binary_features : accepted for interface compatibility; currently unused
        by this routine.
    labels : optional dictionary mapping ids to labels, forwarded to the
        encoder as lbl_dict.
    fields : fields to fetch/encode; defaults to ["AB", "TI"]
        (abstract and title).
    out_f_name : prefix for the encoded output file names.

    NOTE(review): this is the last of several redefinitions of this function
    in the file, so it is the one in effect at import time.
    '''
    # Default handled here rather than as a mutable default argument,
    # which would be shared across calls.
    if fields is None:
        fields = ["AB", "TI"]

    # first, fetch the articles
    fetch_and_write_out(article_ids, out_dir, fields=fields)

    for field in fields:
        print("encoding %s..." % field)
        # now, clean and encode them
        out_for_field = os.path.join(out_dir, field)
        tfidf2.encode_docs(out_for_field, os.path.join(out_for_field, "encoded"),
                           out_f_name + field, lbl_dict=labels)
    print("finito.")