def get_bag_of_words(self, rawtext):
    """Parse ``rawtext`` markup and request all of its text nodes.

    The raw text is handed to ``BeautifulStoneSoup`` and its text nodes are
    requested with ``findAll(text=True)``.

    Returns ``None`` (after logging two info messages) when parsing raises
    ``UnicodeEncodeError``.

    NOTE(review): in the code visible here the parsed ``soup`` is never used
    and the success path also falls through to an implicit ``None`` -- the
    remainder of this method may be missing from this view; confirm against
    the full file before relying on the return value.
    """
    try:
        soup = BeautifulStoneSoup(rawtext).findAll(text=True)
    except UnicodeEncodeError:
        # The exception object was never inspected, so it is not bound.
        # This spelling is valid on both Python 2 and Python 3, unlike the
        # old "except UnicodeEncodeError, e" form.
        mylog.info("Failed parsing the rawtext file")
        mylog.info("Not including it in arff file")
        return None
def __init__(self, articledir="./articles", skipdb=False):
    """Remember the article directory and, unless skipped, prepare the db.

    Args:
        articledir: Directory holding the article files; stored on
            ``self.articledir``.
        skipdb: When true, no database object is created and ``self.mydb``
            is left unset.

    When the database is used, this attempts to create the ``bagofwords``
    table and its unique ``pmcid`` index. A failure there is logged and
    otherwise ignored, since the table and index may already exist.
    """
    if not skipdb:
        try:
            self.mydb = mydb.MyDb(config.get("db", "db"))
            self.mydb.get_cursor().execute('''create table bagofwords (pmcid text, words text)''')
            self.mydb.get_cursor().execute('''create unique index pmcid_bagofwords_idx on bagofwords (pmcid)''')
        except Exception:
            # Best-effort setup: keep going when creation fails, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare "except:" would.
            mylog.info("Not able to create database bagofwords table or index, maybe because it already exists.")
        # NOTE(review): placement of free() after the try/except was
        # reconstructed from flattened source -- confirm against the original.
        self.mydb.free()
    self.articledir = articledir
def get_freq_dist_from_dataset_tuples(self, dataset_file_tuples):
    """Accumulate one frequency distribution over all listed documents.

    ``dataset_file_tuples`` is a sized collection of
    ``(docid, local_location)`` pairs. For each pair the document text is
    fetched through ``self.get_doctext``; non-empty text is fed to
    ``add_words_to_freq_dist``. Progress is logged once per document.

    Returns the populated ``FreqDist``.
    """
    word_counts = FreqDist()
    total = len(dataset_file_tuples)
    # Positions are 1-based so the progress log reads "1 of N" ... "N of N".
    for position, (docid, local_location) in enumerate(dataset_file_tuples, 1):
        mylog.info("Getting doctext: %-5s of %s, docid %-8s" % (position, total, docid))
        text = self.get_doctext(docid, local_location)
        if not text:
            # Nothing usable for this document; leave the counts untouched.
            continue
        add_words_to_freq_dist(word_counts, text)
    return word_counts
def get_doctext(self, docid, local_location):
    """Return the bag-of-words text for ``docid``, computing it if needed.

    Lookup order:
      1. Text from ``self.doctext_from_db(docid)`` is returned as-is when it
         contains an underscore, which this code treats as the marker that
         it was already processed for bigrams.
      2. Otherwise, when ``local_location`` is falsy there is nothing to
         compute from and ``""`` is returned.
      3. Otherwise the file ``self.articledir + "/" + local_location`` is
         read, passed to ``self.get_bag_of_words``, written back with
         ``self.write_words_to_db`` and returned.

    Args:
        docid: Identifier used for the db lookup and write.
        local_location: File path relative to ``self.articledir``, or a
            falsy value when no local file exists.

    Returns:
        The cached text, ``""`` when no source is available, or whatever
        ``self.get_bag_of_words`` returned for the local file.
    """
    doctext = self.doctext_from_db(docid)
    if doctext:
        is_processed_for_bigrams = "_" in doctext
        if is_processed_for_bigrams:
            mylog.info("Bag of words and bigrams already in db")
            return doctext

    if not local_location:
        mylog.info("No db or local file for %s, so skipping" % (docid))
        return ""

    mylog.info("COMPUTING bag of words and bigrams and writing to db")
    # Use a context manager so the file handle is closed promptly instead of
    # being left for the garbage collector.
    with open(self.articledir + "/" + local_location) as article_file:
        rawtext = article_file.read()
    doctext = self.get_bag_of_words(rawtext)
    self.write_words_to_db(docid, doctext)
    return doctext
def get_feature_list(fd_all_words, min_number_instances):
    """Pick the features that are frequent enough and are not stopwords.

    Args:
        fd_all_words: Frequency distribution exposing ``samples()`` and
            item lookup by sample.
        min_number_instances: Minimum count (inclusive) a sample needs in
            ``fd_all_words`` to be kept.

    Returns:
        A list of the samples that meet the count threshold and for which
        ``in_stopwords`` is false.
    """
    # First pass: keep only samples that reach the count threshold.
    frequent_enough = [
        sample
        for sample in fd_all_words.samples()
        if fd_all_words[sample] >= min_number_instances
    ]
    prelim_size = str(len(frequent_enough))
    mylog.info("Prelim feature list is " + prelim_size + " features long.")

    # Second pass: drop stopwords from the surviving samples.
    return [sample for sample in frequent_enough if not in_stopwords(sample)]