示例#1
0
 def get_bag_of_words(self, rawtext):
     """Parse ``rawtext`` with BeautifulStoneSoup and collect its text nodes.

     Returns None, after logging two info messages, when parsing raises
     UnicodeEncodeError.

     NOTE(review): in the code visible here the parsed text nodes are not
     used any further and the success path falls through to an implicit
     None -- presumably the remainder of the method is missing from this
     excerpt; confirm against the full source.
     """
     try:
         soup = BeautifulStoneSoup(rawtext).findAll(text=True)
     except UnicodeEncodeError:
         # The exception object itself is not needed, so it is not bound;
         # this form of the clause is valid on every Python version.
         mylog.info("Failed parsing the rawtext file")
         mylog.info("Not including it in arff file")
         return None
示例#2
0
 def __init__(self, articledir="./articles", skipdb=False):
     """Remember the article directory and, unless skipped, prepare the db.

     articledir -- stored as ``self.articledir``.
     skipdb     -- when true, no database connection is opened and
                   ``self.mydb`` is not assigned.

     When the database is used, the ``bagofwords`` table and its unique
     index on ``pmcid`` are created on a best-effort basis: a failure is
     logged and ignored. The connection is freed afterwards either way.
     """
     if not skipdb:
         # Connect outside the try block: if the connection cannot be made
         # there is nothing to free, and the real error should surface
         # rather than an AttributeError from the cleanup call below.
         self.mydb = mydb.MyDb(config.get("db", "db"))
         try:
             self.mydb.get_cursor().execute('''create table bagofwords (pmcid text, words text)''')
             self.mydb.get_cursor().execute('''create unique index pmcid_bagofwords_idx on bagofwords (pmcid)''')
         except Exception:
             # Deliberately tolerant (e.g. the table may already exist), but
             # no longer swallows KeyboardInterrupt / SystemExit.
             mylog.info("Not able to create database bagofwords table or index, maybe because it already exists.")
         finally:
             self.mydb.free()

     self.articledir = articledir
示例#3
0
 def get_freq_dist_from_dataset_tuples(self, dataset_file_tuples):
     """Build one FreqDist from the text of every document in the dataset.

     dataset_file_tuples -- sequence of ``(docid, local_location)`` pairs.

     Each document's text is fetched through ``self.get_doctext``; documents
     whose text is empty/falsy are skipped. Progress is logged per document.
     Returns the accumulated FreqDist.
     """
     word_counts = FreqDist()
     total = len(dataset_file_tuples)
     # Positions are 1-based so the progress line reads "1 of N" ... "N of N".
     for position, (docid, local_location) in enumerate(dataset_file_tuples, 1):
         mylog.info("Getting doctext: %-5s of %s, docid %-8s" % (position, total, docid))
         text = self.get_doctext(docid, local_location)
         if not text:
             continue
         add_words_to_freq_dist(word_counts, text)
     return word_counts
示例#4
0
 def get_doctext(self, docid, local_location):
     """Return the bag-of-words text for ``docid``, computing it if needed.

     docid          -- identifier used for the db lookup and the db write.
     local_location -- file path relative to ``self.articledir``; may be
                       empty/None when no local copy exists.

     The text stored in the db is returned directly when it contains an
     underscore, which this code treats as the sign that bigram processing
     was already done. Otherwise the raw article file is read, converted
     with ``self.get_bag_of_words``, written to the db and returned.
     Returns "" when the text must be computed but no local file is given.
     """
     doctext = self.doctext_from_db(docid)
     if doctext and "_" in doctext:
         mylog.info("Bag of words and bigrams already in db")
         return doctext
     if not local_location:
         mylog.info("No db or local file for %s, so skipping" % (docid))
         return ""
     mylog.info("COMPUTING bag of words and bigrams and writing to db")
     # Use a context manager so the file handle is closed promptly instead
     # of being left for the garbage collector.
     with open(self.articledir + "/" + local_location) as article_file:
         rawtext = article_file.read()
     doctext = self.get_bag_of_words(rawtext)
     self.write_words_to_db(docid, doctext)
     return doctext
示例#5
0
def get_feature_list(fd_all_words, min_number_instances):
    """Select the features that are frequent enough and are not stopwords.

    fd_all_words         -- frequency distribution offering ``samples()``
                            and item lookup of a sample's count.
    min_number_instances -- minimum count a sample needs to be kept.

    The size of the frequency-filtered list is logged before stopwords
    (as decided by ``in_stopwords``) are removed. Returns the final list.
    """
    frequent_enough = []
    for candidate in fd_all_words.samples():
        if fd_all_words[candidate] >= min_number_instances:
            frequent_enough.append(candidate)
    mylog.info("Prelim feature list is " + str(len(frequent_enough)) + " features long.")
    return [candidate for candidate in frequent_enough if not in_stopwords(candidate)]