def _getTopWords(self, k, stopword_removal=True):
    # Get the top k words by counting word frequency across all elements.
    text_parser = TextParser(stopword_removal=stopword_removal)
    for element in self._event[self._element_type]:
        element = createElement(self._element_type, element)
        text = element.getText()
        if text is not None:
            text_parser.insertText(text)
    return text_parser.getTopWords(k)
def _getTopWords(self, k, stopword_removal=False):
    # Get the top k words by counting word frequency across all photo captions.
    text_parser = TextParser(stopword_removal=stopword_removal)
    for photo in self._event['photos']:
        p = Photo(photo)
        caption = p.getCaption()
        if caption is not None:
            text_parser.insertCaption(caption)
    return text_parser.getTopWords(k)
def search(self, query, hits=10):
    urls = self._search_google(query)
    text_parser = TextParser.create()
    if 'who' in query.lower():
        # Count person names found on each result page.
        names_aggregation = {}
        for url in urls:
            names = text_parser.find_names(self._get_page(url))
            for name in names:
                if names_aggregation.get(name, None):
                    names_aggregation[name] += 1
                else:
                    names_aggregation[name] = 1
        data = sorted(names_aggregation, key=names_aggregation.get)[::-1][:10]
        return { 'keyword': 'who', 'data': data }
    elif 'where' in query.lower():
        # Count place mentions (extracted with the same name finder as above).
        places_aggregation = {}
        for url in urls:
            places = text_parser.find_names(self._get_page(url))
            for place in places:
                if places_aggregation.get(place, None):
                    places_aggregation[place] += 1
                else:
                    places_aggregation[place] = 1
        data = sorted(places_aggregation, key=places_aggregation.get)[::-1][:10]
        return { 'keyword': 'where', 'data': data }
    else:
        # Fall back to generic hit words extracted from the parsed (souped) page.
        hitword_aggregation = {}
        for url in urls:
            words = text_parser.find_hitwords(self._get_page(url, souped=True))
            for word in words:
                if hitword_aggregation.get(word, None):
                    hitword_aggregation[word] += 1
                else:
                    hitword_aggregation[word] = 1
        data = sorted(hitword_aggregation, key=hitword_aggregation.get)[::-1][:10]
        return { 'keyword': None, 'data': data }
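# Each branch above repeats the same count-and-rank bookkeeping by hand. A minimal
# sketch of how that aggregation could be collapsed with collections.Counter; the
# helper name _rank_terms and its limit parameter are assumptions, not part of the
# original code.
from collections import Counter

def _rank_terms(term_lists, limit=10):
    # Count every extracted term across all pages and keep the most frequent ones.
    counts = Counter()
    for terms in term_lists:
        counts.update(terms)
    return [term for term, _ in counts.most_common(limit)]

# Usage sketch: _rank_terms(text_parser.find_names(self._get_page(url)) for url in urls)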
def populate_ips_from_text_file(self):
    # Load the initial file.
    file_content = self.get_text_from_file(file_path=self.file_path)

    # Instantiate the text parser that extracts the IP addresses.
    text_parser = TextParser(raw_text=file_content)
    raw_ips = text_parser.list_of_ip_adresses_contained_in_raw_text()

    # Get environment variables for the GeoIpService.
    env_variable_getter = EnvVariableGetter()
    api_key = env_variable_getter.get_variable("api_key")
    api_url = env_variable_getter.get_variable("api_url")

    # Instantiate the GeoIp service, which is responsible for fetching IP geolocation.
    geo_ip_service = GeoIpService(key=api_key, url=api_url)

    # Look up each IP, build ip_models from the response data, and store them
    # in the list and the dict for further filtering.
    for ip in raw_ips:
        geo_ip_response = geo_ip_service.get_geo_ip_info_for_ip(
            ip_address=ip, format="json")
        self.generate_and_store_ip_model(ip=ip, data=geo_ip_response.json())
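# EnvVariableGetter is not shown in this snippet. A minimal sketch of what it might
# look like, assuming it simply wraps os.environ; the optional default argument is
# an assumption, not part of the original code.
import os

class EnvVariableGetter(object):
    # Thin wrapper around os.environ so configuration lookups stay in one place.
    def get_variable(self, name, default=None):
        return os.environ.get(name, default)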
def __init__(self, debug=False, log=None, data_dir="data", clf_type=None): if clf_type is None: self.__print__('ERR', 'unable to create SentiAnalys obj: no classificator type') sys.exit(1) TextParser.__init__(self, debug, log, data_dir) self.senti_class = ['нейтральная', 'позитивная', 'негативная'] if clf_type == 0: clf_fname = 'log_clf.model' elif clf_type == 1: clf_fname = 'xgboost_clf.model' else: self.__print__('ERR', 'unable to create SentiAnalys obj: incorrect classificator type {}'.format(clf_type)) sys.exit(1) self.clf_type = clf_type try: clf_f = open(clf_fname, 'r') self.clf = pickle.load(clf_f) # TODO: check clf type clf_f.close() except Exception as e: self.__print__('ERR', "unable to init SentiAnalys obj: {}".format(e))
def PhotoDistanceByCaption(photo1, photo2):
    p1 = Photo(photo1)
    p2 = Photo(photo2)
    cap1 = p1.getCaption()
    cap2 = p2.getCaption()
    cp1 = TextParser(True)
    cp1.insertCaption(cap1)
    cp2 = TextParser(True)
    cp2.insertCaption(cap2)
    word_list1 = cp1.getTopWords(-1)
    word_list2 = cp2.getTopWords(-1)
    if len(word_list1) == 0 or len(word_list2) == 0:
        # unable to compare
        return None
    word_dict1 = {}
    for word, freq in word_list1:
        word_dict1[word] = freq
    word_dict2 = {}
    for word, freq in word_list2:
        word_dict2[word] = freq
    return kldiv(word_dict1, word_dict2)
def ElementDistanceByText(self, element1, element2):
    p1 = createElement(self._element_type, element1)
    p2 = createElement(self._element_type, element2)
    cap1 = p1.getText()
    cap2 = p2.getText()
    cp1 = TextParser(True)
    cp1.insertText(cap1)
    cp2 = TextParser(True)
    cp2.insertText(cap2)
    word_list1 = cp1.getTopWords(-1)
    word_list2 = cp2.getTopWords(-1)
    if len(word_list1) == 0 or len(word_list2) == 0:
        # unable to compare
        return None
    word_dict1 = {}
    for word, freq in word_list1:
        word_dict1[word] = freq
    word_dict2 = {}
    for word, freq in word_list2:
        word_dict2[word] = freq
    return kldiv(word_dict1, word_dict2)
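# Both distance helpers above depend on a kldiv() function that is not shown here.
# A minimal sketch of one plausible implementation, assuming the inputs are
# word -> raw-frequency dicts; the smoothing constant and the normalization are
# assumptions, not the original code.
import math

def kldiv(word_dict1, word_dict2, smoothing=1e-9):
    # Build a shared vocabulary, turn raw counts into smoothed probabilities,
    # then accumulate KL(P || Q) = sum(p * log(p / q)).
    vocab = set(word_dict1) | set(word_dict2)
    total1 = float(sum(word_dict1.values())) + smoothing * len(vocab)
    total2 = float(sum(word_dict2.values())) + smoothing * len(vocab)
    divergence = 0.0
    for word in vocab:
        p = (word_dict1.get(word, 0) + smoothing) / total1
        q = (word_dict2.get(word, 0) + smoothing) / total2
        divergence += p * math.log(p / q)
    return divergence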
def __init__(self, batch_size=50, debug=False, log=None, data_dir="data"):
    TextParser.__init__(self, debug, log, data_dir)
    self.db_cn = DBConnector()
    self.__select_news_agent_info__()
    self.batch_size = batch_size
from text_parser import TextParser
import time

# Quick smoke test for TextParser batching over a personal journal file.
path = '/home/tim/Dropbox/Notes/journal.txt'
tp = TextParser(path)
tp.prepare_batches(128, 32, 4)
print(len(tp.vocab.keys()))
for i in range(10):
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))
for i in range(1000000):
    tp.get_next_feed_dict('a', 'b')
def __init__(self, debug=False, log=None, data_dir="data"): TextParser.__init__(self, debug, log, data_dir)
def tokenize(text):
    # print text
    tokenizer = TextParser(stopword_file='stopwords.txt')
    # tokens = tokenizer.parse_words(text)
    tokens = tokenizer.parse_words(text, stem=True)
    return tokens
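# For comparison, a minimal sketch of an equivalent standalone tokenizer built on
# NLTK rather than the project's TextParser; it assumes the NLTK 'punkt' and
# 'stopwords' data are already downloaded, and the function name is hypothetical.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def tokenize_with_nltk(text):
    # Lowercase, split into word tokens, drop stopwords, and stem what remains.
    stemmer = PorterStemmer()
    stop_set = set(stopwords.words('english'))
    tokens = [t for t in nltk.word_tokenize(text.lower()) if t.isalpha()]
    return [stemmer.stem(t) for t in tokens if t not in stop_set]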
def clean_words(self, word_parser):
    self.words = word_parser.parse_words(self.raw_message)

# clean words for one message (NOT efficient)
def clean_words_pos(self, word_parser, preserve_types, ark_run_cmd):
    self.words = word_parser.parse_words_by_ark_nlp(self.raw_message,
                                                    preserve_types, ark_run_cmd)

def set_clean_words(self, clean_words):
    # self.words = list(set(clean_words))
    self.words = clean_words

def remove_stopwords(self, stopword_set):
    trimmed_words = []
    for w in self.words:
        if w not in stopword_set:
            trimmed_words.append(w)
    self.words = trimmed_words


if __name__ == '__main__':
    wp = TextParser(min_length=2)
    m = Message('hello, This is@ went octopi just a test for 12you!. Try it http://')
    preserve_types = ['V', 'N', '^']
    ark_run_cmd = 'java -XX:ParallelGCThreads=2 -Xmx2G -jar /Users/chao/Dropbox/code/lib/ark-tweet-nlp-0.3.2.jar'
    m.clean_words(wp)
    print(m.words)
    m.clean_words_pos(wp, set(['S', 'N', '^']), ark_run_cmd)
    print(m.words)