def pos_tag(inp: str, out, tagdir: str = '/usr/local/tree-tagger'):
    """Generate a POS representation of the input data and pickle it.

    Parameters
    ----------
    inp : str
        Path to the input data, consumed by ``read_data``.
    out : file object
        Writable binary file handle the results are pickled into; must
        expose a ``name`` attribute (a regular open file does).
    tagdir : str
        TreeTagger installation directory.
    """
    texts, genders, ages = read_data(inp)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR=tagdir,
                                          TAGOPT='-token -sgml')
    logging.info('POS tagging data')

    pos_texts = []
    # Map document index -> progress label, one entry per whole percent.
    # For short inputs several percentages collapse onto the same index;
    # the dict keeps the last (highest) label, which is the right one.
    d_infothresholds = {
        int((i / 100.0 * len(texts))): "%i%%" % (i)
        for i in range(0, 101)
    }
    for i, t in enumerate(texts):
        # Keep only well-formed "token<TAB>tag" lines from the tagger;
        # SGML markers and malformed lines are dropped.
        tags = [
            el.split('\t')[1] for el in tagger.tag_text(t)
            if len(el.split()) == 2
        ]
        pos_texts.append(' '.join(tags))
        if i in d_infothresholds:  # direct membership test; .keys() was redundant
            logging.info('%s of documents processed', d_infothresholds[i])

    logging.info('Pickling results to %s', out.name)
    pickle.dump((pos_texts, genders, ages), out)
# NOTE(review): scrape artifact removed here ("示例#2" / "0" — "Example #2"
# separator from the source the snippet was copied from). The function below
# is a second, compact copy of pos_tag; one of the two should be dropped.
def pos_tag(inp: str, out, tagdir: str = '/usr/local/tree-tagger'):
    """Generate a POS representation of the input data and pickle it.

    NOTE(review): this is a duplicate of the ``pos_tag`` defined earlier
    in this file; being defined later, it shadows the first one.

    Parameters
    ----------
    inp : str
        Path to the input data, consumed by ``read_data``.
    out : file object
        Writable binary file handle the results are pickled into; must
        expose a ``name`` attribute.
    tagdir : str
        TreeTagger installation directory.
    """
    texts, genders, ages = read_data(inp)
    tagger = treetaggerwrapper.TreeTagger(TAGLANG='en',
                                          TAGDIR=tagdir,
                                          TAGOPT='-token -sgml')
    logging.info('POS tagging data')

    pos_texts = []
    # Document index -> progress label, one entry per whole percent.
    d_infothresholds = {
        int((i / 100.0 * len(texts))): "%i%%" % (i)
        for i in range(0, 101)
    }
    for i, t in enumerate(texts):
        # Keep only well-formed "token<TAB>tag" lines from the tagger.
        tags = [
            el.split('\t')[1] for el in tagger.tag_text(t)
            if len(el.split()) == 2
        ]
        pos_texts.append(' '.join(tags))
        if i in d_infothresholds:  # direct membership test; .keys() was redundant
            logging.info('%s of documents processed', d_infothresholds[i])

    logging.info('Pickling results to %s', out.name)
    pickle.dump((pos_texts, genders, ages), out)
def extract_feats(data: str, feat_name: str, outpath: str, pos: bool = False, **kwargs):
    """Extract features from input data and write the matrix to disk.

    Parameters
    ----------
    data : str
        Path to the pickled input data, consumed by ``read_data``.
    feat_name : str
        Key into ``FEAT_DICT`` selecting the vectorizer class.
    outpath : str
        Directory the feature matrix (and ``feats.log``) is written to.
    pos : bool
        When True and ``data`` is the default token jar, read from the
        POS jar instead.  (Annotation fixed: the default is ``False``
        and the value is used as a flag, so ``str`` was wrong.)
    **kwargs
        Forwarded to the vectorizer constructor.  For ``feat_name ==
        'ngram'`` the keys ``ngram_range``, ``max_features``, ``min_df``
        and ``max_df`` must be present.
    """
    # input correction: ensure a trailing separator.  endswith() is safe
    # on an empty string, where outpath[-1] would raise IndexError.
    path = outpath if outpath.endswith('/') else outpath + '/'
    # Transparently swap in the POS jar when POS features are requested
    # on the default token data.
    if pos and data == DEFAULT_TOKEN_JAR:
        src = DEFAULT_POS_JAR
    else:
        src = data

    # genders/ages are not used for feature extraction.
    texts, _, _ = read_data(src)
    vect = FEAT_DICT[feat_name](**kwargs)

    if feat_name == 'ngram':
        logging.info(
            'Generating ngram feature vectors '
            '(range:%s, max_features:%s, min_df:%s, max_df:%s)',
            kwargs['ngram_range'], kwargs['max_features'],
            kwargs['min_df'], kwargs['max_df'])
    else:
        logging.info('Generating %s feature vectors', feat_name)
    feats = vect.fit_transform(texts)

    logging.info('Writing feature matrix to disk')

    timestamp = datetime.now().strftime('%m%d_%H%M%S')
    if feat_name == 'ngram':
        feat = 'pos_' + feat_name if pos else feat_name
        fn = 'feat_matr_{}_n{}_{}'.format(
            feat,
            '-'.join(str(n) for n in kwargs['ngram_range']),
            timestamp)
    else:
        fn = 'feat_matr_{}_{}'.format(feat_name, timestamp)
    np.save(path + fn, feats)

    # write parameters (if any) to log file to avoid ridiculous file names
    if kwargs:
        with open(path + 'feats.log', 'a') as op:
            op.write('\n{} {} {}\t{}'.format(
                datetime.now().strftime('%d-%m-%y %H:%M:%S'),
                fn, feat_name, src))
            for key, val in sorted(kwargs.items()):
                op.write(' {}:{}'.format(key, val))