예제 #1
0
def encode_review(review_id, base_dir="/home/byron/abstrackr-web/abstrackr/lib/curious_snake/data"):
    '''
    Write one review's documents to disk, then encode each text field.

    The review gets its own directory, <base_dir>/<review_id>. The title,
    abstract and keywords fields are encoded in separate spaces (multi-view):
    for each field, tfidf2.encode_docs is called with the per-field directory
    <review dir>/<field>, the output directory <review dir>/<field>/encoded
    and the output name "<field>_encoded".

    review_id -- identifier of the review; str(review_id) is the directory name.
    base_dir  -- parent directory under which the review directory lives.

    Returns the path of the review's directory.
    '''
    fields = ["title", "abstract", "keywords"]

    base_dir = os.path.join(base_dir, str(review_id))

    # write the abstracts to disk
    to_disk(base_dir, review_ids=[review_id], fields=fields)

    # Load the label dictionary. labels.pickle is presumably written by
    # to_disk above -- TODO confirm. The file is opened in a `with` block so
    # the handle is closed deterministically; the open mode is left at the
    # default so it keeps matching however the file was written.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    labels_path = os.path.join(base_dir, "labels.pickle")
    with open(labels_path) as labels_file:
        lbl_d = pickle.load(labels_file)

    # we encode the three main fields in separate spaces (multi-view)
    for field in fields:
        print("\n\n\n(write_review_to_disk) on field: %s..." % field)

        dir_path = os.path.join(base_dir, field)
        out_path = os.path.join(dir_path, "encoded")
        out_f_name = "%s_encoded" % field
        tfidf2.encode_docs(dir_path, out_path, out_f_name,
                           lbl_dict=lbl_d, clean_first=True, binary=True,
                           min_word_count=3, bi_grams_too=True)

    # hand back the review-specific directory that now holds the encoded data
    return base_dir
예제 #2
0
def fetch_and_encode(article_ids, out_dir, binary_features=False,
                     labels=None, fields=None, out_f_name=""):
    '''
    First fetches from the web, then encodes them.

    article_ids     -- article ids, passed through to fetch_and_write_out.
    out_dir         -- directory handed to fetch_and_write_out; each field is
                       then encoded from the <out_dir>/<field> subdirectory
                       into <out_dir>/<field>/encoded.
    binary_features -- NOTE(review): accepted but currently unused; it is not
                       forwarded to tfidf2.encode_docs.
    labels          -- passed to tfidf2.encode_docs as lbl_dict.
    fields          -- field names to fetch and encode; ["AB", "TI"] when
                       None.
    out_f_name      -- prefix of each encoded file name; the field name is
                       appended to it.
    '''
    # Build the default list per call instead of sharing one mutable default
    # object between calls (it is also handed to fetch_and_write_out).
    if fields is None:
        fields = ["AB", "TI"]

    # first, fetch the articles
    fetch_and_write_out(article_ids, out_dir, fields=fields)

    for field in fields:
        print("encoding %s..." % field)
        # now, clean and encode them
        out_for_field = os.path.join(out_dir, field)
        encoded_dir = os.path.join(out_for_field, "encoded")
        tfidf2.encode_docs(out_for_field, encoded_dir, out_f_name + field,
                           lbl_dict=labels)
    print("finito.")
예제 #3
0
def fetch_and_encode(article_ids, out_dir, binary_features=False,
                     lbl_dict=None, fields=None, out_f_name=""):
    '''
    First fetches from the web, then encodes them.

    lbl_dict is assumed to be a dictionary mapping ids to labels. If it is not provided
    documents will be encoded with a "?" as their label.

    article_ids     -- article ids, passed through to fetch_and_write_out.
    out_dir         -- directory handed to fetch_and_write_out; each field is
                       then encoded from the <out_dir>/<field> subdirectory
                       into <out_dir>/<field>/encoded.
    binary_features -- NOTE(review): accepted but currently unused; it is not
                       forwarded to tfidf2.encode_docs.
    fields          -- field names to fetch and encode; ["AB", "TI"] when
                       None.
    out_f_name      -- prefix of each encoded file name; the field name is
                       appended to it.
    '''
    # Build the default list per call instead of sharing one mutable default
    # object between calls (it is also handed to fetch_and_write_out).
    if fields is None:
        fields = ["AB", "TI"]

    # first, fetch the articles
    fetch_and_write_out(article_ids, out_dir, fields=fields)

    for field in fields:
        print("encoding %s..." % field)
        # now, clean and encode them
        out_for_field = os.path.join(out_dir, field)
        encoded_dir = os.path.join(out_for_field, "encoded")
        tfidf2.encode_docs(out_for_field, encoded_dir,
                           out_f_name + field, lbl_dict=lbl_dict)
    print("finito.")
예제 #4
0
def fetch_and_encode(article_ids,
                     out_dir,
                     binary_features=False,
                     labels=None,
                     fields=None,
                     out_f_name=""):
    '''
    First fetches from the web, then encodes them.

    article_ids     -- article ids, passed through to fetch_and_write_out.
    out_dir         -- directory handed to fetch_and_write_out; each field is
                       then encoded from the <out_dir>/<field> subdirectory
                       into <out_dir>/<field>/encoded.
    binary_features -- NOTE(review): accepted but currently unused; it is not
                       forwarded to tfidf2.encode_docs.
    labels          -- passed to tfidf2.encode_docs as lbl_dict.
    fields          -- field names to fetch and encode; ["AB", "TI"] when
                       None.
    out_f_name      -- prefix of each encoded file name; the field name is
                       appended to it.
    '''
    # Build the default list per call instead of sharing one mutable default
    # object between calls (it is also handed to fetch_and_write_out).
    if fields is None:
        fields = ["AB", "TI"]

    # first, fetch the articles
    fetch_and_write_out(article_ids, out_dir, fields=fields)

    for field in fields:
        print("encoding %s..." % field)
        # now, clean and encode them
        out_for_field = os.path.join(out_dir, field)
        encoded_dir = os.path.join(out_for_field, "encoded")
        tfidf2.encode_docs(out_for_field,
                           encoded_dir,
                           out_f_name + field,
                           lbl_dict=labels)
    print("finito.")