def timed_dataload(loader, data, args, embedder, values, labels):
    # use a separate counter to account for invalid input along the way
    counter = 0
    for text, sentiment in data:
        try:
            if counter % 10000 == 0:
                print("Loading at {}".format(counter))
            # normalize if requested
            if "normalize" in args:
                text_normalized = data_utils.normalize(text, **args["normalize"])
            else:
                text_normalized = text
            # tokenize according to the input form
            form = args.get("load", {}).get("form", None)
            if form == "hanzi":
                tokens = data_utils.tokenize_hanzi(text_normalized)
            elif form == "arabic":
                text_stripped = loader.twitter_strip(text_normalized)
                tokens = loader.tokenize_arabic(text_stripped)
            else:
                tokens = data_utils.tokenize(text_normalized)
            # choose embedding type
            vector = None
            if args["embed"]["type"] == "concatenated":
                vector = embedder.embed_words_into_vectors_concatenated(tokens, **args["embed"])
            elif args["embed"]["type"] == "averaged":
                vector = embedder.embed_words_into_vectors_averaged(tokens)
            # keep the vector together with its sentiment label
            if vector is not None:
                values.append(vector)
                labels.append(sentiment)
            counter += 1
        except TextTooShortException:
            pass
def timed_dataload(data, args, values, labels):
    # `data_utils` and `embedder` are assumed to be available at module scope
    # use a separate counter to account for invalid input along the way
    counter = 0
    for text, sentiment in data:
        try:
            if counter % 10000 == 0:
                print("Loading at {}".format(counter))
            # normalize if requested
            if 'normalize' in args:
                text_normalized = data_utils.normalize(text, **args['normalize'])
            else:
                text_normalized = text
            # tokenize according to the input form
            if args.get('load', {}).get('form', None) == 'hanzi':
                tokens = data_utils.tokenize_hanzi(text_normalized)
            else:
                tokens = data_utils.tokenize(text_normalized)
            # choose embedding type
            vector = None
            if args['embed']['type'] == 'concatenated':
                vector = embedder.embed_words_into_vectors_concatenated(tokens, **args['embed'])
            elif args['embed']['type'] == 'averaged':
                vector = embedder.embed_words_into_vectors_averaged(tokens)
            # keep the vector together with its sentiment label
            if vector is not None:
                values.append(vector)
                labels.append(sentiment)
            counter += 1
        except TextTooShortException:
            pass
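# Illustrative only: a hypothetical configuration and call showing the shape of
# the `args` dict that timed_dataload reads ('normalize', 'load', 'embed'). The
# concrete values below, and the loader/embedder/data objects, are placeholder
# assumptions rather than the project's actual defaults.
example_args = {
    "normalize": {"min_length": 70, "max_length": 150},
    "load": {"form": "hanzi"},
    "embed": {"type": "averaged"},
}
values, labels = [], []
# data is an iterable of (text, sentiment) pairs:
# timed_dataload(loader, data, example_args, embedder, values, labels)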
def run(self):
    # process valid entries only
    if self.data[0]:
        text, sentiment = self.data
        if text:
            try:
                # normalize if requested
                if 'normalize' in self.args:
                    text_normalized = data_utils.normalize(text, **self.args['normalize'])
                else:
                    text_normalized = text
                # tokenize
                tokens = data_utils.tokenize(text_normalized)
                # choose embedding type
                vector = None
                if self.args['embed']['type'] == 'concatenated':
                    vector = embedder.embed_words_into_vectors_concatenated(tokens, **self.args['embed'])
                elif self.args['embed']['type'] == 'averaged':
                    vector = embedder.embed_words_into_vectors_averaged(tokens)
                # append the labeled vector to the shared lists (thread-safe with lock)
                if vector is not None:
                    threadLock.acquire()
                    values.append(vector)
                    labels.append(sentiment)
                    threadLock.release()
            except TextTooShortException:
                pass
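# Illustrative only: one way the run() method above could be hosted. The class
# name, constructor, shared lists, and lock below are assumptions about the
# surrounding module, not the original implementation.
import threading

threadLock = threading.Lock()
values, labels = [], []

class LoaderThread(threading.Thread):
    def __init__(self, data, args):
        threading.Thread.__init__(self)
        self.data = data    # a single (text, sentiment) pair
        self.args = args    # same configuration dict used by timed_dataload
    # run(self) as defined above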
def normalize_imdb(txt):
    return data_utils.normalize(txt, encoding=None)
def normalize_tweet(txt):
    return data_utils.normalize(txt, min_length=70, max_length=150)
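# Illustrative only: the wrappers above pin per-corpus keyword arguments for
# data_utils.normalize (IMDB reviews skip re-encoding; tweets are constrained to
# a 70-150 character window). A hypothetical dispatch by corpus name:
normalizers = {"imdb": normalize_imdb, "tweet": normalize_tweet}
# cleaned = normalizers["tweet"](raw_tweet_text)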