def render(item, page_number):
    """Recursively walk a pdfminer layout tree, collecting text rows.

    For every non-empty LTTextLine, appends one
    (page_number, x1, y1, x2, y2, text) tuple to self.rows.

    NOTE(review): this is a nested closure — ``self`` is read/written from
    the enclosing method's scope, not passed as a parameter.
    """
    if isinstance(item, (LTPage, LTTextBox)):
        # Containers: just recurse into children.
        for child in item:
            render(child, page_number)
    elif isinstance(item, LTTextLine):
        # Concatenate this line's characters into a single string.
        child_str = ''
        for child in item:
            if isinstance(child, (LTChar, LTAnno)):
                child_str += child.get_text()
        # Collapse internal whitespace runs to single spaces.
        child_str = ' '.join(child_str.split()).strip()
        if child_str:
            row = (page_number,
                   item.bbox[0], item.bbox[1], item.bbox[2], item.bbox[3],
                   child_str)  # bbox == (x1, y1, x2, y2)
            # HACK: detect the outline ("contents") page, then harvest
            # table-of-contents entries mentioning topics of interest.
            # FIX: lower-case once (was recomputed for every keyword) and
            # drop the redundant `self.outline = 1` after `= True`.
            lowered = child_str.lower()
            if 'contents' in lowered:
                self.outline = True
            if self.outline:
                # Substring matching is intentional, e.g. 'agricultur'
                # hits both 'agriculture' and 'agricultural'; note 'act'
                # also matches 'contract' — presumably acceptable recall
                # over precision here.
                keywords = ('agricultur', 'health', 'social', 'schedule',
                            'labour', 'revenue', 'amendment', 'cancellation',
                            'extension', 'correction', 'trade', 'industry',
                            'specification', 'customs', 'renewal',
                            'agreement', 'education', 'regulation',
                            'registration', 'nurse', 'auxiliary', 'student',
                            'benefit', 'act')
                if any(word in lowered for word in keywords):
                    # strip '...' leader dots typical of TOC entries
                    entry = lowered.replace(".", "").strip()
                    language, ratio = detect_language(entry)
                    if language == 'english':
                        self.interesting_text.append(entry)
                    elif ratio['english'] > 0:  # some Eng words
                        self.aux_text.append(entry)
            # end HACK
            self.rows.append(row)
        # Recurse into the line's children as well (no-op for LTChar/LTAnno,
        # which match none of the isinstance branches above).
        for child in item:
            render(child, page_number)
    return
def render(item, page_number):
    """Depth-first traversal of a pdfminer layout tree.

    Each non-empty text line becomes a (page_number, x1, y1, x2, y2, text)
    row in self.rows; when an outline/contents page is detected, matching
    entries are also routed to self.interesting_text / self.aux_text.

    NOTE(review): closure — it relies on ``self`` from the enclosing method.
    """
    if isinstance(item, (LTPage, LTTextBox)):
        for child in item:
            render(child, page_number)
    elif isinstance(item, LTTextLine):
        # Build the line's text from its character/annotation children.
        child_str = ''
        for child in item:
            if isinstance(child, (LTChar, LTAnno)):
                child_str += child.get_text()
        child_str = ' '.join(child_str.split()).strip()
        if child_str:
            row = (page_number,
                   item.bbox[0], item.bbox[1], item.bbox[2], item.bbox[3],
                   child_str)  # bbox == (x1, y1, x2, y2)
            # HACK -- check if it is an outline page.
            # FIX: hoist .lower() out of the long keyword chain (it was
            # evaluated once per keyword) and remove the redundant
            # `self.outline = 1` that immediately followed `= True`.
            lowered = child_str.lower()
            if 'contents' in lowered:
                self.outline = True
            if self.outline:
                # Deliberate substring match ('act' matches 'contract' too).
                topic_markers = ('agricultur', 'health', 'social', 'schedule',
                                 'labour', 'revenue', 'amendment',
                                 'cancellation', 'extension', 'correction',
                                 'trade', 'industry', 'specification',
                                 'customs', 'renewal', 'agreement',
                                 'education', 'regulation', 'registration',
                                 'nurse', 'auxiliary', 'student', 'benefit',
                                 'act')
                if any(marker in lowered for marker in topic_markers):
                    # strip '...' leader dots from TOC entries
                    entry = lowered.replace(".", "").strip()
                    language, ratio = detect_language(entry)
                    if language == 'english':
                        self.interesting_text.append(entry)
                    elif ratio['english'] > 0:  # some Eng words
                        self.aux_text.append(entry)
            # end HACK
            self.rows.append(row)
        # Harmless recursion over LTChar/LTAnno children (no branch matches).
        for child in item:
            render(child, page_number)
    return
def tokenize_document(docpair, use_nltk=True): print 'working on doc {}'.format(docpair[0]) if not use_nltk: if FILTER_ENGLISH: return [ x.lower_.encode('ascii', errors='ignore') for x in nlp(docpair[1]) if detect_language(x) == 'english' ] return [ x.lower_.encode('ascii', errors='ignore') for x in nlp(docpair[1]) ] else: if FILTER_ENGLISH: return [ x.encode('ascii', errors='ignore').lower() for x in word_tokenize(docpair[1]) if detect_language(x) == 'english' ] return [ x.encode('ascii', errors='ignore').lower() for x in word_tokenize(docpair[1]) ]
def track_generator(languages, emotion, seeds, year_range, recreate=False): station = Pandora_Station.get_or_create(languages, emotion, seeds, recreate) for track in station.get_playlist(): try: artist, song_name = track['artistName'], track['songName'] except KeyError: continue detected_language = detect_language(song_name) if detected_language in languages: rdio_track = Rdio_Track.search(artist, song_name) if rdio_track is not None: release_year = rdio_track.get_release_date().year if release_year in year_range: print u"Found {}".format(rdio_track) yield rdio_track else: print u"Release year {} outside range for {}".format(release_year, song_name) else: print u"No rdio version of {}".format(song_name) else: print u"Wrong language {} for {}".format(detected_language, song_name) if not recreate: for rdio_track in track_generator(languages, emotion, seeds, year_range, recreate=True): yield rdio_track
def tokenize_document(docpair, use_nltk=True): print 'working on doc {}'.format(docpair[0]) if not use_nltk: if FILTER_ENGLISH: return [x.lower_.encode('ascii',errors='ignore') for x in nlp(docpair[1]) if detect_language(x) == 'english'] return [x.lower_.encode('ascii',errors='ignore') for x in nlp(docpair[1])] else: if FILTER_ENGLISH: return [x.encode('ascii',errors='ignore').lower() for x in word_tokenize(docpair[1]) if detect_language(x) == 'english'] return [x.encode('ascii',errors='ignore').lower() for x in word_tokenize(docpair[1])]
def get_reviews_data(partitions_to_use, pickle_base_name): """ Gets loaded json data in pickles and returns fields of interest """ data = load_partitions(partitions_to_use, pickle_base_name) review_texts = [] useful_votes = [] funny_votes = [] cool_votes = [] review_stars = [] for review in data: review_texts.append(review['text']) useful_votes.append(review['votes']['useful' cool_votes.append(review['votes']['cool']) funny_votes.append(review['votes']['funny']) review_stars.append(review['stars']) return review_texts, useful_votes, funny_votes, cool_votes, review_stars def give_balanced_classes(reviews, funny_votes): """ From all the reviews and votes given, partitions the data into two classes: funny reviews and not funny reviews. All the funny reviews found are returned. The method is assuming majority of not funny votes. The same number of not funny reviews are returned, randomly selected. Returned data is a shuffled balanced set of funny and not funny reviews. """ # We will consider a review to be funny if it has 3 or more funny votes. # Not funny reviews have 0 votes. 
VOTES_THRESHOLD = 3 not_funny_reviews_indices = [] # Find all the funny reviews we can final_reviews = [] final_labels = [] for i, review in enumerate(reviews): if funny_votes[i] >= VOTES_THRESHOLD: final_reviews.append(review) final_labels.append(1) elif funny_votes[i] == 0: not_funny_reviews_indices.append(i) # We want balanced classes so take same number np.random.shuffle(not_funny_reviews_indices) num_funny_reviews = len(final_reviews) for i in range(num_funny_reviews): final_reviews.append(reviews[not_funny_reviews_indices[i]]) final_labels.append(0) # Shuffle final reviews and labels combined_lists = zip(final_reviews, final_labels) np.random.shuffle(combined_lists) final_reviews[:], final_labels[:] = zip(*combined_lists) print "Returning %d funny reviews and a total of %d reviews" % (num_funny_reviews, len(final_reviews)) return (final_reviews, final_labels) def create_data_sets(partition_list=range(1,100), pickle_base_name=DEFAULT_REVIEWS_PICKLE + '.'): """ Creates a 50% - 25% - 25% train/validation/test partition of the classification problem. Classes are balanced. It reads the list of partitions saved in pickles. Resulting data sets are saved as python pickles. 
""" load_partitions(partition_list, pickle_base_name) reviews, _, funny_votes, _, _ = get_reviews_data(partition_list, pickle_base_name) reviews, labels = give_balanced_classes(reviews, funny_votes) N = len(reviews) train_reviews = reviews[:N/2] train_labels = labels[:N/2] dev_reviews = reviews[N/2:3*N/4] dev_labels = labels[N/2:3*N/4] test_reviews = reviews[3*N/4:] test_labels = labels[3*N/4:] pickle.dump([train_reviews, train_labels], open("TrainSet_" + str(N), "wb"), pickle.HIGHEST_PROTOCOL) pickle.dump([dev_reviews, dev_labels], open("DevSet_" + str(N), "wb"), pickle.HIGHEST_PROTOCOL) pickle.dump([test_reviews, test_labels], open("TestSet_" + str(N), "wb"), pickle.HIGHEST_PROTOCOL) def accept_only_english(json_review): # Short texts are hard to classify in any language, so they will be accepted if len(json_review['text']) <= 150: return True else: return language.detect_language(json_review['text']) == 'english'