import re
from operator import itemgetter

# `ngram`, `lexical` and `feature_to_numeric` are project-local helpers assumed
# to be importable in this module.


def gen_feature(atuple):
    # Collapse every URL in the text down to its host name before featurizing.
    text = re.sub(r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
                  '\\1', atuple['text'], flags=re.MULTILINE)
    featurize = {}
    # Byte n-gram counts, in both variants returned by ngram.get_byte_ngrams().
    bngram = ngram.get_byte_ngrams(text)
    for n in bngram['ngram_byte']:
        for k in bngram['ngram_byte'][n]:
            featurize[('nb%d' % n, k)] = bngram['ngram_byte'][n][k]
    for n in bngram['ngram_byte_cs']:
        for k in bngram['ngram_byte_cs'][n]:
            featurize[('nbcs%d' % n, k)] = bngram['ngram_byte_cs'][n][k]
    # Word n-gram counts, raw and cleaned.
    wngram = ngram.get_word_ngrams(text)
    for n in wngram['ngram_word']:
        for k in wngram['ngram_word'][n]:
            featurize[('nw%d' % n, ' '.join(k))] = wngram['ngram_word'][n][k]
    for n in wngram['ngram_word_clean']:
        for k in wngram['ngram_word_clean'][n]:
            featurize[('nwc%d' % n, ' '.join(k))] = wngram['ngram_word_clean'][n][k]
    # Unigram word counts, raw ('w') and cleaned ('cw').
    words = ngram.get_word_ngram(text, n=1, clean=False)
    words = {k[0]: words[k] for k in words}
    for word in words:
        featurize[('w', word)] = words[word]
    clean_words = ngram.get_word_ngram(text, n=1, clean=True)
    clean_words = {k[0]: clean_words[k] for k in clean_words}
    for word in clean_words:
        featurize[('cw', word)] = clean_words[word]
    # Lexical symbol-distribution features.
    lex = lexical.get_symbol_dist(text)
    for k in lex['lex']:
        featurize[('l', k)] = lex['lex'][k]
    # Map feature keys to numeric ids and emit a sorted, sparse 'index:value' string.
    featurize = feature_to_numeric(featurize)
    featurize = [(k, featurize[k]) for k in featurize]
    featurize = sorted(featurize, key=itemgetter(0))
    vector = ' '.join(['%d:%d' % (i, j) for i, j in featurize])
    return (atuple['id'], vector)
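The feature_to_numeric helper is not part of this listing. A minimal sketch of one plausible implementation, assuming it maps each (prefix, token) key to a stable 1-based integer id via a module-level index (the _feature_index name is an assumption), could look like this:

_feature_index = {}  # hypothetical global map: feature key -> integer id (assumption)

def feature_to_numeric(featurize):
    # Replace each (prefix, token) key with a stable integer id, keeping the counts.
    numeric = {}
    for key, count in featurize.items():
        if key not in _feature_index:
            _feature_index[key] = len(_feature_index) + 1  # 1-based ids for the 'idx:count' output
        numeric[_feature_index[key]] = count
    return numeric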
Example #2
def gen_feature(atuple):
    text = re.sub(
        r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
        '\\1',
        atuple['text'],
        flags=re.MULTILINE)
    # Same feature families as above, but collect only the set of feature keys (no counts).
    aset = set()
    bngram = ngram.get_byte_ngrams(text)
    for n in bngram['ngram_byte']:
        for k in bngram['ngram_byte'][n]:
            aset.add(('nb%d' % n, k))
    for n in bngram['ngram_byte_cs']:
        for k in bngram['ngram_byte_cs'][n]:
            aset.add(('nbcs%d' % n, k))
    wngram = ngram.get_word_ngrams(text)
    for n in wngram['ngram_word']:
        for k in wngram['ngram_word'][n]:
            aset.add(('nw%d' % n, ' '.join(k)))
    for n in wngram['ngram_word_clean']:
        for k in wngram['ngram_word_clean'][n]:
            aset.add(('nwc%d' % n, ' '.join(k)))
    words, clean_words = ngram.get_words(text)
    for word in words:
        aset.add(('w', word))
    for word in clean_words:
        aset.add(('cw', word))
    lex = lexical.get_symbol_dist(text)
    for k in lex['lex']:
        aset.add(('l', k))
    return aset
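One way the key sets returned by this variant could be used is to build a global feature index over a corpus; build_feature_index and records are illustrative names and not part of the original code, and the sketch assumes keys within a prefix family are mutually comparable:

def build_feature_index(records):
    # Union the per-document key sets returned by gen_feature, then assign 1-based ids.
    vocab = set()
    for atuple in records:
        vocab |= gen_feature(atuple)
    return {key: i + 1 for i, key in enumerate(sorted(vocab))}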
Example #3
def gen_feature(atuple):
    text = re.sub(
        r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
        '\\1',
        atuple['text'],
        flags=re.MULTILINE)
    # Reduced variant: byte n-grams, cleaned word n-grams, raw unigrams and lexical features only.
    featurize = {}
    bngram = ngram.get_byte_ngrams(text)
    for n in bngram['ngram_byte']:
        for k in bngram['ngram_byte'][n]:
            featurize[('nb%d' % n, k)] = bngram['ngram_byte'][n][k]
    wngram = ngram.get_word_ngrams(text)
    for n in wngram['ngram_word_clean']:
        for k in wngram['ngram_word_clean'][n]:
            featurize[('nwc%d' % n,
                       ' '.join(k))] = wngram['ngram_word_clean'][n][k]
    words = ngram.get_word_ngram(text, n=1, clean=False)
    words = {k[0]: words[k] for k in words}
    for word in words:
        featurize[('w', word)] = words[word]
    lex = lexical.get_symbol_dist(text)
    for k in lex['lex']:
        featurize[('l', k)] = lex['lex'][k]
    featurize = feature_to_numeric(featurize)
    featurize = [(k, featurize[k]) for k in featurize]
    featurize = sorted(featurize, key=itemgetter(0))
    vector = ' '.join(['%d:%d' % (i, j) for i, j in featurize])
    return (atuple['id'], vector)
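A small usage sketch for the (id, vector) output of these count-based variants, writing one sparse feature line per record; write_features, records and the file name are illustrative assumptions:

def write_features(records, path='features.txt'):
    # Write one "<id> <idx:count ...>" line per input record.
    with open(path, 'w') as out:
        for atuple in records:
            doc_id, vector = gen_feature(atuple)
            out.write('%s %s\n' % (doc_id, vector))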
def gen_feature(atuple):
    text = re.sub(r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
                  '\\1', atuple['text'], flags=re.MULTILINE)
    # Variant that keeps only word-n-gram ('npos*') and word ('pos') keys; clean_words is unpacked but unused.
    aset = set()
    wngram = ngram.get_word_ngrams(text)
    for n in wngram['ngram_word']:
        for k in wngram['ngram_word'][n]:
            aset.add(('npos%d' % n, ' '.join(k)))
    words, clean_words = ngram.get_words(text)
    for word in words:
        aset.add(('pos', word))
    return aset