def gen_feature(atuple):
    """Turn one record into an ``(id, sparse-vector string)`` pair.

    Every URL in ``atuple['text']`` is replaced by the run of host-like
    characters that follows its scheme. Counts are then gathered under
    ``(tag, value)`` keys from byte n-grams (``nb<n>``, ``nbcs<n>``), word
    n-grams (``nw<n>``, ``nwc<n>``), single words (``w``, ``cw``) and the
    symbol distribution (``l``). The collected mapping is passed through
    ``feature_to_numeric`` and rendered as space-separated ``key:value``
    pairs (both formatted with ``%d``) in ascending key order.

    Args:
        atuple: mapping providing at least the ``'text'`` and ``'id'`` keys.

    Returns:
        A ``(atuple['id'], vector)`` tuple where ``vector`` is a string.
    """
    url_pattern = r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*'
    # The pattern has no ^/$ anchors; the MULTILINE flag is passed through
    # unchanged from the original call.
    text = re.sub(url_pattern, '\\1', atuple['text'], flags=re.MULTILINE)

    # Insertion order is kept stable on purpose: the dict is handed to
    # feature_to_numeric, whose sensitivity to ordering is not visible here.
    counts = {}

    byte_grams = ngram.get_byte_ngrams(text)
    for tag, table in (('nb%d', byte_grams['ngram_byte']),
                       ('nbcs%d', byte_grams['ngram_byte_cs'])):
        for size, grams in table.items():
            for gram, count in grams.items():
                counts[(tag % size, gram)] = count

    word_grams = ngram.get_word_ngrams(text)
    for tag, table in (('nw%d', word_grams['ngram_word']),
                       ('nwc%d', word_grams['ngram_word_clean'])):
        for size, grams in table.items():
            for gram, count in grams.items():
                # Word n-grams are sequences of tokens; key on the joined text.
                counts[(tag % size, ' '.join(gram))] = count

    # Raw single words first, then the cleaned variant.
    for tag, clean in (('w', False), ('cw', True)):
        unigrams = ngram.get_word_ngram(text, n=1, clean=clean)
        for gram, count in unigrams.items():
            # Each 1-gram key is indexable; only its first element is used.
            counts[(tag, gram[0])] = count

    for symbol, count in lexical.get_symbol_dist(text)['lex'].items():
        counts[('l', symbol)] = count

    numeric = feature_to_numeric(counts)
    vector = ' '.join(
        '%d:%d' % (key, numeric[key]) for key in sorted(numeric)
    )
    return (atuple['id'], vector)
def gen_feature(atuple):
    """Turn one record into an ``(id, sparse-vector string)`` pair.

    Reduced feature set: after replacing every URL in ``atuple['text']``
    by the host-like characters following its scheme, counts are gathered
    under ``(tag, value)`` keys from byte n-grams (``nb<n>``), cleaned word
    n-grams (``nwc<n>``), raw single words (``w``) and the symbol
    distribution (``l``). The mapping is passed through
    ``feature_to_numeric`` and rendered as space-separated ``key:value``
    pairs (both formatted with ``%d``) in ascending key order.

    Args:
        atuple: mapping providing at least the ``'text'`` and ``'id'`` keys.

    Returns:
        A ``(atuple['id'], vector)`` tuple where ``vector`` is a string.
    """
    url_pattern = r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*'
    text = re.sub(url_pattern, '\\1', atuple['text'], flags=re.MULTILINE)

    # Insertion order mirrors the original sequence of loops because the
    # dict is handed to feature_to_numeric afterwards.
    counts = {}

    byte_tables = ngram.get_byte_ngrams(text)['ngram_byte']
    for size, grams in byte_tables.items():
        for gram, count in grams.items():
            counts[('nb%d' % size, gram)] = count

    clean_word_tables = ngram.get_word_ngrams(text)['ngram_word_clean']
    for size, grams in clean_word_tables.items():
        for gram, count in grams.items():
            # Word n-grams are sequences of tokens; key on the joined text.
            counts[('nwc%d' % size, ' '.join(gram))] = count

    unigrams = ngram.get_word_ngram(text, n=1, clean=False)
    for gram, count in unigrams.items():
        # Each 1-gram key is indexable; only its first element is used.
        counts[('w', gram[0])] = count

    for symbol, count in lexical.get_symbol_dist(text)['lex'].items():
        counts[('l', symbol)] = count

    numeric = feature_to_numeric(counts)
    vector = ' '.join(
        '%d:%d' % (key, numeric[key]) for key in sorted(numeric)
    )
    return (atuple['id'], vector)
def gen_feature(atuple):
    """Collect the set of distinct ``(tag, value)`` features of a record.

    Every URL in ``atuple['text']`` is replaced by the host-like characters
    following its scheme. The result holds one pair per distinct byte
    n-gram (``nb<n>``, ``nbcs<n>``), word n-gram (``nw<n>``, ``nwc<n>``),
    word (``w``), cleaned word (``cw``) and symbol (``l``). Only presence
    is recorded; the counts stored by the helper modules are ignored.

    Args:
        atuple: mapping providing at least the ``'text'`` key.

    Returns:
        A new ``set`` of ``(tag, value)`` tuples.
    """
    url_pattern = r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*'
    text = re.sub(url_pattern, '\\1', atuple['text'], flags=re.MULTILINE)

    features = set()

    byte_grams = ngram.get_byte_ngrams(text)
    for tag, table in (('nb%d', byte_grams['ngram_byte']),
                       ('nbcs%d', byte_grams['ngram_byte_cs'])):
        features.update(
            (tag % size, gram) for size in table for gram in table[size]
        )

    word_grams = ngram.get_word_ngrams(text)
    for tag, table in (('nw%d', word_grams['ngram_word']),
                       ('nwc%d', word_grams['ngram_word_clean'])):
        # Word n-grams are sequences of tokens; key on the joined text.
        features.update(
            (tag % size, ' '.join(gram))
            for size in table
            for gram in table[size]
        )

    raw_words, cleaned_words = ngram.get_words(text)
    features.update(('w', word) for word in raw_words)
    features.update(('cw', word) for word in cleaned_words)

    symbols = lexical.get_symbol_dist(text)['lex']
    features.update(('l', symbol) for symbol in symbols)

    return features
def gen_feature(atuple):
    """Return the distinct ``(tag, value)`` features found in a record.

    URLs inside ``atuple['text']`` are first reduced to the host-like
    characters that follow the scheme. Each feature source then contributes
    tagged pairs: byte n-grams (``nb<n>``, ``nbcs<n>``), word n-grams
    (``nw<n>``, ``nwc<n>``), words (``w``), cleaned words (``cw``) and
    symbols (``l``). Counts reported by the helper modules are not kept;
    membership in the returned set is the only information retained.

    Args:
        atuple: mapping providing at least the ``'text'`` key.

    Returns:
        A new ``set`` of ``(tag, value)`` tuples.
    """
    text = re.sub(
        r'https?://([a-zA-Z0-9\.\-_]+)[\w\-\._~:/\?#@!\$&\'\*\+,;=%%]*',
        '\\1',
        atuple['text'],
        flags=re.MULTILINE,
    )

    byte_grams = ngram.get_byte_ngrams(text)
    plain_bytes = byte_grams['ngram_byte']
    found = {
        ('nb%d' % size, gram)
        for size in plain_bytes
        for gram in plain_bytes[size]
    }
    cs_bytes = byte_grams['ngram_byte_cs']
    found |= {
        ('nbcs%d' % size, gram)
        for size in cs_bytes
        for gram in cs_bytes[size]
    }

    word_grams = ngram.get_word_ngrams(text)
    plain_words = word_grams['ngram_word']
    # Word n-grams are sequences of tokens; key on the joined text.
    found |= {
        ('nw%d' % size, ' '.join(gram))
        for size in plain_words
        for gram in plain_words[size]
    }
    clean_grams = word_grams['ngram_word_clean']
    found |= {
        ('nwc%d' % size, ' '.join(gram))
        for size in clean_grams
        for gram in clean_grams[size]
    }

    raw_words, cleaned_words = ngram.get_words(text)
    found |= {('w', word) for word in raw_words}
    found |= {('cw', word) for word in cleaned_words}

    symbol_table = lexical.get_symbol_dist(text)['lex']
    found |= {('l', symbol) for symbol in symbol_table}

    return found