Example #1
def timed_dataload(loader, data, args, embedder, values, labels):

    # use separate counter to account for invalid input along the way
    counter = 0

    for text, sentiment in data:

        try:
            if counter % 10000 == 0:
                print("Loading at {}".format(counter))

            # normalize and tokenize if necessary
            if "normalize" in args:
                text_normalized = data_utils.normalize(text, **args["normalize"])
            else:
                text_normalized = text

            # tokenize
            if args.get("load", {}).get("form", None) == "hanzi":
                tokens = data_utils.tokenize_hanzi(text_normalized)
            elif args.get("load", {}).get("form", None) == "arabic":
                text_stripped = loader.twitter_strip(text_normalized)
                tokens = loader.tokenize_arabic(text_stripped)
            else:
                tokens = data_utils.tokenize(text_normalized)

            # choose embedding type
            vector = None
            if args["embed"]["type"] == "concatenated":
                vector = embedder.embed_words_into_vectors_concatenated(tokens, **args["embed"])
            elif args["embed"]["type"] == "averaged":
                vector = embedder.embed_words_into_vectors_averaged(tokens)
            else:
                pass

            # data labeled by sentiment score
            if vector is not None:
                values.append(vector)
                labels.append(sentiment)
                counter += 1

        except TextTooShortException:
            # skip entries that are too short to process
            pass
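
For reference, a minimal sketch of how this loader might be driven. The args layout is inferred from the key lookups above, the sample data is made up, and loader and embedder stand for already-constructed project objects, so this depends on the project's data_utils module and is not runnable on its own.

# Hypothetical driver for timed_dataload; args keys mirror the lookups above.
args = {
    "normalize": {},                  # kwargs forwarded to data_utils.normalize
    "load": {"form": "hanzi"},        # "hanzi", "arabic", or omit for the default tokenizer
    "embed": {"type": "averaged"},    # "averaged" or "concatenated"
}

data = [("This film was wonderful", 1), ("Terrible, avoid it", 0)]  # (text, sentiment) pairs

values, labels = [], []
timed_dataload(loader, data, args, embedder, values, labels)
# values now holds one embedding per usable entry, labels the matching sentiment scores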
Example #2
def timed_dataload(data, args, values, labels):

    # use separate counter to account for invalid input along the way
    counter = 0

    for text, sentiment in data:

        try:
            if counter % 10000 == 0:
                print("Loading at {}".format(counter))

            # normalize and tokenize if necessary
            if 'normalize' in args:
                text_normalized = data_utils.normalize(text, **args['normalize'])
            else:
                text_normalized = text

            # tokenize
            if args.get('load', {}).get('form', None) == 'hanzi':
                tokens = data_utils.tokenize_hanzi(text_normalized)
            else:
                tokens = data_utils.tokenize(text_normalized)

            # choose embedding type (embedder is assumed to be a module-level instance here)
            vector = None
            if args['embed']['type'] == 'concatenated':
                vector = embedder.embed_words_into_vectors_concatenated(tokens, **args['embed'])
            elif args['embed']['type'] == 'averaged':
                vector = embedder.embed_words_into_vectors_averaged(tokens)
            else:
                pass

            # data labeled by sentiment score
            if vector is not None:
                values.append(vector)
                labels.append(sentiment)
                counter += 1

        except TextTooShortException:
            # skip entries that are too short to process
            pass
Example #3
    def run(self):

        # process valid entries
        if self.data[0]:

            # process valid data
            text, sentiment = self.data
            if text:
                try:
                    # normalize and tokenize if necessary
                    if 'normalize' in self.args:
                        text_normalized = data_utils.normalize(
                            text, **self.args['normalize'])
                    else:
                        text_normalized = text

                    # tokenize
                    tokens = data_utils.tokenize(text_normalized)

                    # choose embedding type
                    vector = None
                    if self.args['embed']['type'] == 'concatenated':
                        vector = embedder.embed_words_into_vectors_concatenated(
                            tokens, **self.args['embed'])
                    elif self.args['embed']['type'] == 'averaged':
                        vector = embedder.embed_words_into_vectors_averaged(
                            tokens)
                    else:
                        pass

                    # data labeled by sentiment score (thread-safe with lock)
                    if vector is not None:
                        threadLock.acquire()
                        values.append(vector)
                        labels.append(sentiment)
                        threadLock.release()

                except TextTooShortException:
                    # skip entries that are too short to process
                    pass
Example #4
    def run(self):

        # process valid entries
        if self.data[0]:

            # process valid data
            text, sentiment = self.data
            if text:
                try:
                    # normalize and tokenize if necessary
                    if 'normalize' in self.args:
                        text_normalized = data_utils.normalize(text, **self.args['normalize'])
                    else:
                        text_normalized = text

                    # tokenize
                    tokens = data_utils.tokenize(text_normalized)

                    # choose embedding type
                    vector = None
                    if self.args['embed']['type'] == 'concatenated':
                        vector = embedder.embed_words_into_vectors_concatenated(tokens, **self.args['embed'])
                    elif self.args['embed']['type'] == 'averaged':
                        vector = embedder.embed_words_into_vectors_averaged(tokens)
                    else:
                        pass

                    # data labeled by sentiment score (thread-safe with lock)
                    if vector is not None:
                        threadLock.acquire()
                        values.append(vector)
                        labels.append(sentiment)
                        threadLock.release()

                except TextTooShortException:
                    # skip entries that are too short to process
                    pass
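
Both run variants above only make sense inside a threading.Thread subclass and rely on module-level threadLock, values, labels, and embedder. A rough sketch of that surrounding scaffolding follows; the LoadWorker class name and the driving code are assumptions, and only the names actually used by run() are taken from the examples.

import threading

# Shared state the run() methods above expect at module level.
threadLock = threading.Lock()
values = []
labels = []
# embedder and data_utils are assumed to be imported from the project.

class LoadWorker(threading.Thread):    # hypothetical class name
    def __init__(self, entry, args):
        super().__init__()
        self.data = entry              # a single (text, sentiment) pair
        self.args = args

    # the run() method shown in Examples #3/#4 goes here

# Typical driving code (data is an iterable of (text, sentiment) pairs):
#   workers = [LoadWorker(entry, args) for entry in data]
#   for w in workers: w.start()
#   for w in workers: w.join()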
Example #5
def normalize_imdb(txt):
    return data_utils.normalize(txt, encoding=None)
Example #6
def normalize_tweet(txt):
    return data_utils.normalize(txt, min_length=70, max_length=150)
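
A small usage sketch for the two wrappers above. The sample strings are invented, and the try/except mirrors how Examples #1-#4 guard this pipeline with TextTooShortException; only keyword arguments already shown are relied on.

# Hypothetical driver; requires the project's data_utils module and exception class.
tweets = ["an example tweet that sits comfortably inside the 70-150 character window ...", "too short"]

clean_tweets = []
for t in tweets:
    try:
        clean_tweets.append(normalize_tweet(t))   # min_length=70, max_length=150
    except TextTooShortException:
        # skip inputs the pipeline rejects, as the loaders above do
        pass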