def get_me_some_test_samples(file_path, prob_to_take=1.0):
    """
    :param file_path: the path of the file we load the data from
    :param prob_to_take: the probability of taking a certain news item as a test sample
    :return: returns X_test and Y_test.
        X_test is a list of lists, where each inner list is a list of features.
        Y_test holds the class id for each list of features in X_test.
    """
    cat_to_ind, ind_to_cat = Utility.load_categories()
    vectorizer = get_persisted_vectorizer(vectorizer_file_path)
    X_test = []
    Y_test = []
    for line in open(file_path):
        # each line is "<category>\t<text>"
        parts = line.decode(encoding="utf-8").strip().split("\t")
        category = parts[0]
        text = parts[1]
        if random.random() < prob_to_take:
            X_test.append(text)
            Y_test.append(cat_to_ind[category])
    X_test = vectorizer.transform(X_test)
    return X_test, Y_test
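
# A minimal usage sketch (kept commented so the module imports cleanly):
# evaluating the persisted classifier on held-out samples. get_persisted_model()
# and Utility.test_file_path are assumptions here, mirroring
# get_persisted_vectorizer(); model.score() is the standard scikit-learn
# accuracy helper.
#
#   model = get_persisted_model()  # hypothetical loader, not defined in this section
#   X_test, Y_test = get_me_some_test_samples(Utility.test_file_path, prob_to_take=0.2)
#   print "Accuracy: %.3f" % model.score(X_test, Y_test)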
def create_model(model_file_name, vectorizer_file_name, training_percentage=0.93, max_iter=300, print_logs=False):
    """
    creates the model for logistic regression and stores it for later use
    :param model_file_name: the name of the file where the model will be stored
    :param vectorizer_file_name: the name of the file where the vectorizer will be stored
    :param training_percentage: how much of the training set we should use
    :param max_iter: the maximum number of iterations for training
    :param print_logs: whether we should print the logs (such as the time needed for training) or not
    :return: returns the model and the vectorizer
    """
    begin = time.time()
    t0 = time.time()
    X_train = []
    Y_train = []
    # cat_to_ind maps a category to its id; ind_to_cat maps an id back to its category
    cat_to_ind, ind_to_cat = Utility.load_categories()
    for line in open(Utility.training_file_path):
        # each line is "<category>\t<text>"
        parts = line.decode(encoding="utf-8").strip().split("\t")
        category = parts[0]
        text = parts[1]
        if random.random() < training_percentage:
            X_train.append(text)
            Y_train.append(cat_to_ind[category])
    if print_logs:
        print "Read the dataset for %d seconds" % (time.time() - t0)
    t0 = time.time()
    vectorizer = TfidfVectorizer(encoding="utf-8", lowercase=True, stop_words=Utility.stop_words)
    X_train = vectorizer.fit_transform(X_train)
    if print_logs:
        print "Vectorized the training set for %d seconds" % (time.time() - t0)
    classifier = linear_model.LogisticRegression(max_iter=max_iter)
    t0 = time.time()
    classifier.fit(X_train, Y_train)
    if print_logs:
        print "Trained the model for %d seconds" % (time.time() - t0)
    t0 = time.time()
    # pickle the classifier and the vectorizer (binary mode for pickled data)
    file_to_write = open(model_file_name, "wb")
    Pickler(file_to_write).dump(classifier)
    file_to_write.close()
    file_to_write = open(vectorizer_file_name, "wb")
    Pickler(file_to_write).dump(vectorizer)
    file_to_write.close()
    if print_logs:
        print "Dumped the items for %d seconds." % (time.time() - t0)
        print "Total time: %d seconds" % (time.time() - begin)
    return classifier, vectorizer
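
# A minimal usage sketch (commented out); the file names below are illustrative
# assumptions, not fixed by this module.
#
#   classifier, vectorizer = create_model('logreg.model', 'tfidf.vectorizer',
#                                         training_percentage=0.9, print_logs=True)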
def parse_rss_feed(rss_feed_url, stop_after=None):
    feedback = ''
    #model = get_persisted_model()
    _, ind_to_cat = Utility.load_categories()
    #vectorizer = get_persisted_vectorizer()
    try:
        # opening the url and reading the content
        c = urlopen(rss_feed_url)
        content = c.read()
        soup = BeautifulSoup(content)
        ind = 0
        tuples = []
        for item in soup.findAll('item'):
            ind += 1
            if (stop_after is not None) and ind > stop_after:
                break
            try:
                title = item.find('title').string
                link_url = item.find('link').string
                description_object = item.find('description')
                description = ''
                for p in BeautifulSoup(description_object.string).findAll('p'):
                    description += '%s ' % p.text
                feedback += 'title: %s\n' % title.strip()
                feedback += 'link_url: %s\n' % link_url
                feedback += 'description: %s\n' % description
                pub_date = item.find('pubdate')
                if pub_date is not None:
                    pub_date = pub_date.string
                    datetime_obj = parse(pub_date, ignoretz=True)
                    feedback += 'pub_date: %s\n' % (datetime_obj.strftime('%B %d %Y %H:%M'))
                    # milliseconds since the epoch
                    date_milli = (datetime_obj - epoch).total_seconds() * 1000.0
                    feedback += 'milli: %f\n' % date_milli
                else:
                    feedback += 'pub_date: None\n'
                text = ''
                img_url = None
                # we try to fetch the photo url directly from the rss feed; if that
                # is not possible, we will try again later from the article page
                if (item.description is not None) and (item.description.string is not None):
                    img_obj = BeautifulSoup(item.description.string).find('img')
                    if img_obj is not None:
                        img_url = img_obj['src']
                elif item.description is not None:
                    img_obj = item.description.find('img')
                    if img_obj is not None:
                        img_url = img_obj['src']
                # here we get the content of the news article
                link_content = urlopen(link_url).read()
                innerSoup = BeautifulSoup(link_content)
                # drop scripts and styles so only the visible text remains
                for script in innerSoup(['script', 'style']):
                    script.extract()
                if rss_feed_url in Utility.fetch_text_specifications:
                    specifications = Utility.fetch_text_specifications[rss_feed_url]
                    if isinstance(specifications, list):
                        # we take the paragraphs in the [start, end) range
                        start = specifications[0]
                        end = len(innerSoup.findAll('p'))
                        if len(specifications) > 1:
                            end = specifications[1]
                        for p in innerSoup.findAll('p')[start:end]:
                            text += p.text
                    else:
                        tag_type = specifications['tag_type']
                        attr_type = specifications['attribute_type']
                        attr_value = specifications['attribute_value']
                        sections = innerSoup.findAll(tag_type, {attr_type: attr_value})
                        # if we need to go deeper into nested tags
                        if 'nested_tag_type' in specifications:
                            nested_tag_type = specifications['nested_tag_type']
                            nested_attr_type = specifications['nested_attribute_type']
                            nested_attr_value = specifications['nested_attribute_value']
                            limit = specifications.get('limit', 1000)
                            recursive = specifications.get('recursive', True)
                            new_sections = []
                            for section in sections:
                                new_sections.extend(section.findAll(nested_tag_type, {nested_attr_type: nested_attr_value}, limit=limit, recursive=recursive))
                            sections = new_sections
                        for section in sections:
                            feedback += 'tag name: %s\n' % section.name
                            text += section.text
                # we are trying to get the image from the news page itself
                if img_url is None:
                    imgs = innerSoup.findAll('img')
                    img_url = ''
                    if imgs is not None and len(imgs) > 0:
                        img_url = imgs[0]['src']
                feedback += 'img_url: %s\n' % img_url
                feedback += 'text: %s\n' % text.strip()
                #X_train = vectorizer.transform([text])
                #cat_ind = model.predict(X_train)
                #feedback += 'CATEGORY: %s' % ind_to_cat[cat_ind[0]]
                feedback += '------------------------------\n'
                tuples.append((title, link_url, Utility.getWords(text)))
            except Exception as inst:
                feedback += 'Inner Exception type: %s\n' % str(type(inst))
                feedback += 'Inner Exception message: %s\n' % inst.message
        feedback += 'Number of posts: %d\n' % ind
        # go from the front: scan the first 100 word positions and record where
        # the posts stop sharing the same word, i.e. where the boilerplate
        # prefix common to every post ends
        front = -1
        for ind in xrange(100):
            everywhere = True
            for i in xrange(1, len(tuples)):
                if not tuples[i - 1][2][ind] == tuples[i][2][ind]:
                    everywhere = False
                    break
            if not everywhere:
                front = 1 + ind
                break
        return feedback
    except Exception as inst:
        feedback += 'Exception type: %s\n' % type(inst)
        feedback += 'Exception message: %s\n' % inst.message
        # if there is an exception, we return an empty list of news posts
        return feedback
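
# A minimal usage sketch (commented out); the feed URL below is illustrative.
# parse_rss_feed() returns the accumulated feedback string describing each
# parsed post.
#
#   print parse_rss_feed('http://example.com/news/rss', stop_after=5)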