Example #1
from nltk.tokenize import TweetTokenizer

# filter_text and create_ngram_set are assumed to be helpers defined in the same module.
def parsedata(lines,
              word_list,
              split_word_list,
              emoji_dict,
              abbreviation_dict,
              normalize_text=False,
              split_hashtag=False,
              ignore_profiles=False,
              lowercase=False,
              replace_emoji=True,
              n_grams=None,
              at_character=False):
    data = []
    for i, line in enumerate(lines):
        if (i % 100 == 0):
            print(str(i) + '...', end='', flush=True)

        try:

            # convert the line to lowercase
            if (lowercase):
                line = line.lower()

            # split into token
            token = line.split('\t')

            # ID
            id = token[0]

            # label
            label = int(token[1].strip())

            # tweet text
            target_text = TweetTokenizer().tokenize(token[2].strip())
            if (at_character):
                target_text = [c for c in token[2].strip()]

            if n_grams is not None:
                n_grams_list = list(
                    create_ngram_set(target_text, ngram_value=n_grams))
                target_text.extend(['_'.join(n) for n in n_grams_list])

            # filter text
            target_text = filter_text(target_text,
                                      word_list,
                                      split_word_list,
                                      emoji_dict,
                                      abbreviation_dict,
                                      normalize_text,
                                      split_hashtag,
                                      ignore_profiles,
                                      replace_emoji=replace_emoji)

            # awc dimensions
            dimensions = []
            if (len(token) > 3 and token[3].strip() != 'NA'):
                dimensions = [
                    dimension.split('@@')[1]
                    for dimension in token[3].strip().split('|')
                ]

            # context tweet
            context = []
            if (len(token) > 4):
                if (token[4] != 'NA'):
                    context = TweetTokenizer().tokenize(token[4].strip())
                    context = filter_text(context,
                                          word_list,
                                          split_word_list,
                                          emoji_dict,
                                          abbreviation_dict,
                                          normalize_text,
                                          split_hashtag,
                                          ignore_profiles,
                                          replace_emoji=replace_emoji)

            # author
            author = 'NA'
            if (len(token) > 5):
                author = token[5]

            if (len(target_text) != 0):
                # print((label, target_text, dimensions, context, author))
                data.append(
                    (id, label, target_text, dimensions, context, author))
        except:
            raise
    print('')
    return data
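
A minimal call sketch for this function, assuming a tab-separated input file and the resource-loading helpers used in Example #2 below (load_split_word, InitializeWords, load_unicode_mapping, load_abbreviation); the file names are placeholders, not part of the original code.

# Hypothetical paths, purely for illustration.
with open('train.tsv', encoding='utf-8') as f:
    lines = f.readlines()

split_word_list = load_split_word('split_words.txt')
word_list = InitializeWords('word_list.txt')            # only needed when split_hashtag=True
emoji_dict = load_unicode_mapping('emoji_unicode.txt')  # only needed when replace_emoji=True
abbreviation_dict = load_abbreviation()

train_data = parsedata(lines, word_list, split_word_list, emoji_dict,
                       abbreviation_dict, split_hashtag=True, lowercase=True)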
Example #2
import numpy

from nltk.tokenize import TweetTokenizer

# load_split_word, InitializeWords, load_unicode_mapping, load_abbreviation,
# filter_text and create_ngram_set are assumed to be helpers from the same module.
def parse_sent(sent,
               word_file_path,
               split_word_path,
               emoji_file_path,
               vocab,
               normalize_text=False,
               split_hashtag=False,
               ignore_profiles=False,
               lowercase=False,
               replace_emoji=True,
               n_grams=None,
               at_character=False):

    # convert the sentence to lowercase
    if (lowercase):
        sent = sent.lower()

    target_text = TweetTokenizer().tokenize(sent.strip())
    if (at_character):
        target_text = [c for c in sent.strip()]

    if n_grams is not None:
        n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
        target_text.extend(['_'.join(n) for n in n_grams_list])

    word_list = None
    emoji_dict = None

    # load split files
    split_word_list = load_split_word(split_word_path)

    # load word dictionary
    if (split_hashtag):
        word_list = InitializeWords(word_file_path)

    if (replace_emoji):
        emoji_dict = load_unicode_mapping(emoji_file_path)
    abbreviation_dict = load_abbreviation()

    # filter text
    target_text = filter_text(target_text,
                              word_list,
                              split_word_list,
                              emoji_dict,
                              abbreviation_dict,
                              normalize_text,
                              split_hashtag,
                              ignore_profiles,
                              replace_emoji=replace_emoji)

    known_words_set = set()
    unknown_words_set = set()

    tokens = 0
    token_coverage = 0
    vec = []

    # tweet
    for words in target_text:
        tokens = tokens + 1
        if words in vocab:
            vec.append(vocab[words])
            token_coverage = token_coverage + 1
            known_words_set.add(words)
        else:
            vec.append(vocab['unk'])
            unknown_words_set.add(words)

    return numpy.asarray([vec])
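
A brief usage sketch for parse_sent, assuming hypothetical resource file paths and a toy word-to-index vocabulary; the real vocabulary would come from the training data and must contain an 'unk' entry, since out-of-vocabulary tokens are mapped to vocab['unk'].

# Toy vocabulary and placeholder paths, purely for illustration.
vocab = {'unk': 1, 'this': 2, 'is': 3, 'great': 4, 'not': 5}
vec = parse_sent('This is soooo great!!! #not',
                 'word_list.txt',
                 'split_words.txt',
                 'emoji_unicode.txt',
                 vocab,
                 split_hashtag=True,
                 lowercase=True)
print(vec.shape)  # (1, number_of_tokens)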
Example #3
from nltk.tokenize import TweetTokenizer

# filter_text and create_ngram_set are assumed to be helpers defined in the same module;
# dataset is expected to be a pandas DataFrame with at least ID, label and text columns.
def parsedata(dataset,
              word_list,
              split_word_list,
              emoji_dict,
              abbreviation_dict,
              normalize_text=False,
              split_hashtag=False,
              ignore_profiles=False,
              lowercase=False,
              replace_emoji=True,
              n_grams=None,
              at_character=False):
    data = []

    columns = list(dataset.columns)

    for i, tuple in enumerate(dataset.itertuples()):
        if (i % 100 == 0):
            print(str(i) + '...', end='', flush=True)

        try:

            # ID
            id = tuple.ID

            # label
            label = tuple.label

            target_text = tuple.text

            # convert the line to lowercase
            if (lowercase):
                target_text = target_text.lower()

            # tweet text
            target_text = TweetTokenizer().tokenize(target_text.strip())

            if n_grams is not None:
                n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
                target_text.extend(['_'.join(n) for n in n_grams_list])

            # filter text
            target_text = filter_text(target_text, word_list, split_word_list, emoji_dict, abbreviation_dict,
                                      normalize_text,
                                      split_hashtag,
                                      ignore_profiles, replace_emoji=replace_emoji)
            # split at character
            if (at_character):
                target_text = [c for c in ' '.join(target_text)]

            # awc dimensions
            dimensions = []
            if ('psychological_dimension' in columns and tuple.psychological_dimension.strip() != 'NA'):
                dimensions = [dimension.split('@@')[1] for dimension in
                              tuple.psychological_dimension.strip().split('|')]

            # context tweet
            context = []
            if ('context' in columns):
                if (tuple.context != 'NA'):
                    context = tuple.context.strip()
                    # convert the line to lowercase
                    if (lowercase):
                        context = context.lower()

                    context = TweetTokenizer().tokenize(context)
                    context = filter_text(context, word_list, split_word_list, emoji_dict, abbreviation_dict,
                                          normalize_text,
                                          split_hashtag,
                                          ignore_profiles, replace_emoji=replace_emoji)

            # author
            author = 'NA'
            if ('author' in columns):
                author = tuple.author

            if (len(target_text) != 0):
                # print((label, target_text, dimensions, context, author))
                data.append((id, label, target_text, dimensions, context, author))
            else:
                print(tuple)
        except:
            raise
    print('')
    return data
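
A short usage sketch for the DataFrame variant, assuming the input TSV is loaded with pandas and the same resource-loading helpers as above; the column names and paths are placeholders chosen to match the attributes this function reads.

import pandas as pd

# Placeholder path and column layout, purely for illustration.
dataset = pd.read_csv('train.tsv', sep='\t',
                      names=['ID', 'label', 'text', 'psychological_dimension',
                             'context', 'author'])

split_word_list = load_split_word('split_words.txt')
abbreviation_dict = load_abbreviation()

# With split_hashtag and replace_emoji disabled, word_list and emoji_dict can stay None,
# mirroring how Example #2 only loads them when those flags are set.
data = parsedata(dataset, None, split_word_list, None, abbreviation_dict,
                 lowercase=True, replace_emoji=False)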