Example #1
 def _getTopWords(self, k, stopword_removal=True):
     # get top words by counting the frequency
     text_parser = TextParser(stopword_removal=stopword_removal)
     for element in self._event[self._element_type]:
         element = createElement(self._element_type, element)
         text = element.getText()
         if text is not None:
             text_parser.insertText(text)
     return text_parser.getTopWords(k)
Example #2
 def _getTopWords(self, k, stopword_removal=False):
     # get top words by counting the frequency
     text_parser = TextParser(stopword_removal=stopword_removal)
     for photo in self._event['photos']:
         p = Photo(photo)
         caption = p.getCaption()
         if caption is not None:
             text_parser.insertCaption(caption)
     return text_parser.getTopWords(k)
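
Examples #1 and #2 drive TextParser through insertText/insertCaption and
getTopWords, but the class itself is not shown in this listing. A minimal
sketch of the interface those calls assume (all names inferred from the usage
above; STOPWORDS is a placeholder, not the project's real list):

from collections import Counter

STOPWORDS = {'the', 'a', 'an', 'of'}  # placeholder stopword set

class TextParser:
    def __init__(self, stopword_removal=False):
        self.stopword_removal = stopword_removal
        self.counts = Counter()

    def insertText(self, text):
        # tokenize naively and accumulate word frequencies
        words = text.lower().split()
        if self.stopword_removal:
            words = [w for w in words if w not in STOPWORDS]
        self.counts.update(words)

    # captions are counted the same way as any other text
    insertCaption = insertText

    def getTopWords(self, k):
        # Examples #6/#7 call getTopWords(-1), so -1 reads as "all words"
        pairs = self.counts.most_common()
        return pairs if k == -1 else pairs[:k]
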
Example #3
  def search(self, query, hits=10):
    urls = self._search_google(query)
    text_parser = TextParser.create()

    if 'who' in query.lower():
      names_aggregation = {}
      for url in urls:
        names = text_parser.find_names(self._get_page(url))
        for name in names:
          names_aggregation[name] = names_aggregation.get(name, 0) + 1
      data = sorted(names_aggregation,
                    key=names_aggregation.get, reverse=True)[:hits]
      return {
        'keyword': 'who',
        'data': data
      }

    elif 'where' in query.lower():
      places_aggregation = {}
      for url in urls:
        # NOTE: this branch also calls find_names; a place-specific
        # parser call was presumably intended here
        places = text_parser.find_names(self._get_page(url))
        for place in places:
          places_aggregation[place] = places_aggregation.get(place, 0) + 1
      data = sorted(places_aggregation,
                    key=places_aggregation.get, reverse=True)[:hits]
      return {
        'keyword': 'where',
        'data': data
      }

    else:
      hitword_aggregation = {}
      for url in urls:
        words = text_parser.find_hitwords(self._get_page(url, souped=True))
        for word in words:
          hitword_aggregation[word] = hitword_aggregation.get(word, 0) + 1
      data = sorted(hitword_aggregation,
                    key=hitword_aggregation.get, reverse=True)[:hits]
      return {
        'keyword': None,
        'data': data
      }
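
All three branches of search() repeat the same pattern: count occurrences,
then sort and keep the top hits. With the standard library that collapses to
collections.Counter; a sketch (not part of the original project):

from collections import Counter

def top_k(items, k=10):
    # most_common(k) already sorts by count, descending
    return [item for item, count in Counter(items).most_common(k)]
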
Example #4
    def populate_ips_from_text_file(self):

        # Loading the initial file
        file_content = self.get_text_from_file(file_path=self.file_path)

        # Instantiating the text parser that grabs the ips
        text_parser = TextParser(raw_text=file_content)
        raw_ips = text_parser.list_of_ip_adresses_contained_in_raw_text()

        # Get env variables for the GeoIpService
        env_variable_getter = EnvVariableGetter()
        api_key = env_variable_getter.get_variable("api_key")
        api_url = env_variable_getter.get_variable("api_url")

        # Instantiate GeoIp service which is responsible for going out and getting ip geolocation
        geo_ip_service = GeoIpService(key=api_key, url=api_url)

        # Get the GeoIp info using the geo_ip_service, generate ip_models from
        # the response data, and store them in the list and the dict for further filtering
        for ip in raw_ips:
            geo_ip_response = geo_ip_service.get_geo_ip_info_for_ip(
                ip_address=ip, format="json")
            self.generate_and_store_ip_model(ip=ip, data=geo_ip_response.json())
Example #5
	def __init__(self, debug=False, log=None, data_dir="data", clf_type=None):
		if clf_type is None:
			self.__print__('ERR', 'unable to create SentiAnalys obj: no classifier type')
			sys.exit(1)

		TextParser.__init__(self, debug, log, data_dir)
		self.senti_class = ['нейтральная', 'позитивная', 'негативная']  # neutral, positive, negative

		if clf_type == 0:
			clf_fname = 'log_clf.model'
		elif clf_type == 1:
			clf_fname = 'xgboost_clf.model'
		else:
			self.__print__('ERR', 'unable to create SentiAnalys obj: incorrect classifier type {}'.format(clf_type))
			sys.exit(1)

		self.clf_type = clf_type
		try:
			# pickled models must be opened in binary mode
			with open(clf_fname, 'rb') as clf_f:
				self.clf = pickle.load(clf_f)
				# TODO: check clf type
		except Exception as e:
			self.__print__('ERR', "unable to init SentiAnalys obj: {}".format(e))
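
For reference, the matching save step for such a model file is the mirror
image of the load above (a sketch; save_model is not a name from the source):

import pickle

def save_model(clf, fname='log_clf.model'):
    # pickle requires binary mode on both ends of the round trip
    with open(fname, 'wb') as f:
        pickle.dump(clf, f)
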
Example #6
        def PhotoDistanceByCaption(photo1, photo2):

            p1 = Photo(photo1)
            p2 = Photo(photo2)
            cap1 = p1.getCaption()
            cap2 = p2.getCaption()
            cp1 = TextParser(True)
            cp1.insertCaption(cap1)
            cp2 = TextParser(True)
            cp2.insertCaption(cap2)
            word_list1 = cp1.getTopWords(-1)
            word_list2 = cp2.getTopWords(-1)
            if len(word_list1) == 0 or len(word_list2) == 0:
                # unable to compare
                return None
            # build word -> frequency dicts from the (word, freq) pairs
            word_dict1 = dict(word_list1)
            word_dict2 = dict(word_list2)
            return kldiv(word_dict1, word_dict2)
Example #7
        def ElementDistanceByText(element1, element2):

            p1 = createElement(self._element_type, element1)
            p2 = createElement(self._element_type, element2)
            cap1 = p1.getText()
            cap2 = p2.getText()
            cp1 = TextParser(True)
            cp1.insertText(cap1)
            cp2 = TextParser(True)
            cp2.insertText(cap2)
            word_list1 = cp1.getTopWords(-1)
            word_list2 = cp2.getTopWords(-1)
            if len(word_list1) == 0 or len(word_list2) == 0:
                # unable to compare
                return None
            # build word -> frequency dicts from the (word, freq) pairs
            word_dict1 = dict(word_list1)
            word_dict2 = dict(word_list2)
            return kldiv(word_dict1, word_dict2)
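
Examples #6 and #7 both end in kldiv over two word -> frequency dicts, a
helper that is not shown in this listing. A plausible sketch under the usual
smoothed KL-divergence definition (the signature and smoothing are
assumptions):

import math

def kldiv(p_counts, q_counts, eps=1e-9):
    # KL(P || Q) over two frequency dicts, normalized to probabilities;
    # eps-smoothing keeps log() finite for words seen on only one side
    vocab = set(p_counts) | set(q_counts)
    p_total = sum(p_counts.values()) + eps * len(vocab)
    q_total = sum(q_counts.values()) + eps * len(vocab)
    div = 0.0
    for w in vocab:
        p = (p_counts.get(w, 0) + eps) / p_total
        q = (q_counts.get(w, 0) + eps) / q_total
        div += p * math.log(p / q)
    return div
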
Example #8
	def __init__(self, batch_size=50, debug=False, log=None, data_dir="data"):
		TextParser.__init__(self, debug, log, data_dir)

		self.db_cn = DBConnector()
		self.__select_news_agent_info__()
		self.batch_size = batch_size
Example #9
from text_parser import TextParser

path = '/home/tim/Dropbox/Notes/journal.txt'

tp = TextParser(path)
tp.prepare_batches(128, 32, 4)

print(len(tp.vocab))

for i in range(10):
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))

for i in range(1000000):
    tp.get_next_feed_dict('a', 'b')
Example #10
 def __init__(self, debug=False, log=None, data_dir="data"):
     TextParser.__init__(self, debug, log, data_dir)
Example #11
def tokenize(text):
    tokenizer = TextParser(stopword_file='stopwords.txt')
    # tokens = tokenizer.parse_words(text)
    tokens = tokenizer.parse_words(text, stem=True)
    return tokens
Example #12
    def __init__(self, batch_size=50, debug=False, log=None, data_dir="data"):
        TextParser.__init__(self, debug, log, data_dir)

        self.db_cn = DBConnector()
        self.__select_news_agent_info__()
        self.batch_size = batch_size
Example #13
	def __init__(self, debug=False, log=None, data_dir="data"):
		TextParser.__init__(self, debug, log, data_dir)
Example #14
    def clean_words(self, word_parser):
        self.words = word_parser.parse_words(self.raw_message)

    # clean words for one message (NOT efficient)
    def clean_words_pos(self, word_parser, preserve_types, ark_run_cmd):
        self.words = word_parser.parse_words_by_ark_nlp(self.raw_message,
                                                        preserve_types, ark_run_cmd)

    def set_clean_words(self, clean_words):
        # self.words = list(set(clean_words))
        self.words = clean_words

    def remove_stopwords(self, stopword_set):
        self.words = [w for w in self.words if w not in stopword_set]


if __name__ == '__main__':
    wp = TextParser(min_length=2)
    m = Message(
        'hello, This is@ went octopi just a test for 12you!. Try it http://')
    preserve_types = ['V', 'N', '^']
    ark_run_cmd = 'java -XX:ParallelGCThreads=2 -Xmx2G -jar /Users/chao/Dropbox/code/lib/ark-tweet-nlp-0.3.2.jar'
    m.clean_words(wp)
    print(m.words)
    m.clean_words_pos(wp, set(['S', 'N', '^']), ark_run_cmd)
    print(m.words)