Example #1
def main(argv=None):
    args = oparse.parse_args(argv)
    corpus_name = args.corpus
    if args.end is not None:
        sent_start = args.start
        sent_end = args.end
    elif args.start is not None:
        sent_start = 0
        sent_end = args.start
    else:
        sent_start = 0
        sent_end = None
    columns = [None] * 14
    corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
    columns[0] = corpus.attribute('word', 'p')
    sent_attr = corpus.attribute('s', 's')
    if args.fmt == 'conll':
        idx = 1
        for col in args.xcolumns:
            if '=' in col:
                s_idx, att_name = col.split('=')
                s_idx = int(s_idx)
            else:
                s_idx = idx
                att_name = col
            columns[s_idx] = corpus.attribute(att_name, 'p')
            idx = s_idx + 1
        output_sentences(sent_attr, columns, sent_start, sent_end)
    elif args.fmt == 'line':
        output_sentences_line(sent_attr, columns, sent_start, sent_end)
    elif args.fmt == 'bllip':
        output_sentences_bllip(sent_attr, columns, sent_start, sent_end,
                               corpus_name=corpus_name, max_len=args.max_len)
Example #2
def con_source(request, qpos):
    window_size = 100
    corp_name, start, end = cy.decrypt(qpos)
    start, end = int(start), int(end)
    corpus = Corpus(corp_name.upper(),
                    registry_dir='/usr/local/share/cwb/registry')
    words = corpus.attribute('word', 'p')
    corp_len = len(words)
    if start - window_size < 0:
        lb = 0
    else:
        lb = start - window_size
    if end + window_size > corp_len:
        rb = corp_len - 1
    else:
        rb = end + window_size

    lw = ''.join(words[lb:start])
    qw = '<span style="color:red;font-size:24px;">' + ''.join(
        words[start:end]) + '</span>'
    rw = ''.join(words[end:rb])
    if corp_name == 'tccm' or corp_name == 'ntuspk':
        if corp_name == 'tccm':
            s_attrs = corpus.attribute('s_addresser', 's')
        if corp_name == 'ntuspk':
            s_attrs = corpus.attribute('s_speaker', 's')
        top = s_attrs.cpos2struc(lb)
        top = s_attrs[top]
        bottom = s_attrs.cpos2struc(rb)
        bottom = s_attrs[bottom]

        attr_con = []
        for attr in s_attrs:
            if attr[0] >= top[0] and attr[1] <= bottom[1]:
                attr_con.append(attr)
        output = ''
        for a in attr_con:
            if start in xrange(a[0], a[1]):
                sent =\
                    a[-1] + ': ' +\
                    ' '.join(words[a[0]:start]) + ' ' +\
                    '<span style="color:red;font-size:24px;">' +\
                    ' '.join(words[start:end]) + '</span>' + ' ' +\
                    ' '.join(words[end:a[1]])
            else:
                sent = '%s: %s' % (a[-1], ' '.join(words[a[0]:a[1]]))
            output += sent + '<br>'


        return HttpResponse(output)

    return HttpResponse(lw + qw + rw)
Example #3
def con_source(qpos):
    """Return an HTML concordance window for a '<corpus>_<start>_<end>' position string."""
    window_size = 100
    corp_name, start, end = qpos.split('_')
    start, end = int(start), int(end)
    registry_dir = CONF.get('main', 'registry_dir')
    corpus = Corpus(corp_name.upper(), registry_dir=registry_dir)
    words = corpus.attribute('word', 'p')
    corp_len = len(words)
    if start - window_size < 0:
        lb = 0
    else:
        lb = start - window_size
    if end + window_size > corp_len:
        rb = corp_len - 1
    else:
        rb = end + window_size

    lw = ''.join(words[lb:start])
    qw = '<span style="color:red;font-size:24px;">' + ''.join(
        words[start:end]) + '</span>'
    rw = ''.join(words[end:rb])
    if corp_name == 'tccm' or corp_name == 'ntuspk':
        if corp_name == 'tccm':
            s_attrs = corpus.attribute('s_addresser', 's')
        if corp_name == 'ntuspk':
            s_attrs = corpus.attribute('s_speaker', 's')
        top = s_attrs.cpos2struc(lb)
        top = s_attrs[top]
        bottom = s_attrs.cpos2struc(rb)
        bottom = s_attrs[bottom]

        attr_con = []
        for attr in s_attrs:
            if attr[0] >= top[0] and attr[1] <= bottom[1]:
                attr_con.append(attr)
        output = ''
        for a in attr_con:
            if start in xrange(a[0], a[1]):
                sent =\
                    a[-1] + ': ' +\
                    ' '.join(words[a[0]:start]) + ' ' +\
                    '<span style="color:red;font-size:24px;">' +\
                    ' '.join(words[start:end]) + '</span>' + ' ' +\
                    ' '.join(words[end:a[1]])
            else:
                sent = '%s: %s' % (a[-1], ' '.join(words[a[0]:a[1]]))
            output += sent + '<br>'

        return output

    return lw + qw + rw
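
A minimal driver for the function above, as a sketch: it assumes the 'tccm' corpus named in the code is registered and that corpus positions 1000-1002 exist; both values are illustrative.

if __name__ == '__main__':
    # qpos has the form '<corpus>_<start>_<end>'
    print con_source('tccm_1000_1002')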
Example #4
class CorpusInfo:
    def __init__(self, corpus_name):
        self.name = corpus_name
        self.corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
        self.words = self.corpus.attribute('word', 'p')
        self.sentences = self.corpus.attribute('s', 's')
        id_to_start = {}
        text_ids = self.corpus.attribute('file_id', 's')
        for start, end, fname in text_ids:
            id_to_start[fname] = start
        self.id_to_start = id_to_start

    def __getitem__(self, fname):
        return self.sentences.cpos2struc(self.id_to_start[fname])
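
A minimal usage sketch for CorpusInfo, assuming CQP_REGISTRY points at a registry whose corpus carries a file_id s-attribute (the corpus and file names below are hypothetical):

info = CorpusInfo('TUEBADZ')
# index of the first sentence of the file whose file_id is 'file0042'
print "file0042 starts at sentence #%d" % info['file0042']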
Example #5
def create_bow_tag(corpora, language, pos_tags, outdir='.', alph_suffix=''):
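    # NB: 'opts' and 'prefix_l' here are module-level globals set by the
    # enclosing script's option parser; they are not parameters of this
    # function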
    # Step 1: extract unigram distributions for words
    unigram_alph = CPPUniAlphabet()
    unigram_alph.fromfile(
        file(os.path.join(outdir, 'unigram%s_alph.txt' % (alph_suffix, ))))
    unigram_alph.growing = False
    bigram_alph = CPPUniAlphabet()
    bigram_alph.fromfile(
        file(os.path.join(outdir, 'bigram%s_alph.txt' % (alph_suffix, ))))
    bigram_alph.growing = False
    infix = '_'.join(prefix_l)
    if infix != '':
        infix = '_' + infix
    if opts.limit != -1:
        prefix_l.append('%d' % (opts.limit / 1000))
    word_matrix = None
    word_alphs = get_word_alphs_by_pos(language)
    for word_pos in pos_tags:
        word_alph = word_alphs[word_pos]
        word_feat_alph = CPPUniAlphabet()
        for corpus_name in corpora:
            corpus = Corpus(corpus_name)
            att = corpus.attribute(opts.attr_name, 'p')
            att_find = corpus.attribute('tb_lemma', 'p')
            att_sent = corpus.attribute('s', 's')
            pair_alphs = get_pair_alphs_by_pos(opts.language)
            word_alphs = get_word_alphs_by_pos(opts.language)
            print "word features for %s in %s" % (word_pos, corpus_name)
            wmat = gather_word_vectors(list(word_alph), att, att_find,
                                       att_sent, unigram_alph, bigram_alph,
                                       word_feat_alph,
                                       forward_mapping_by_pos(word_pos),
                                       opts.limit)
            if word_matrix is None:
                word_matrix = wmat
            else:
                word_matrix += wmat
        word_feat_alph.tofile_utf8(
            file(
                os.path.join(opts.outdir, 'word_bow%s%s_alph.txt' % (
                    infix,
                    word_pos,
                )), 'w'))
        word_matrix.write_binary(
            file(
                os.path.join(opts.outdir, 'word_bow%s%s_mtx.bin' % (
                    infix,
                    word_pos,
                )), 'w'))
Example #6
def make_bigram_alph(corpora, attr_name='word', suffix='', outdir='.'):
    unigram_freqs = defaultdict(int)
    bigram_freqs = defaultdict(int)
    for corpus_name in corpora:
        print >> sys.stderr, "Reading corpus: %s" % (corpus_name, )
        corpus = Corpus(corpus_name)
        att = corpus.attribute(attr_name, 'p')
        unigram_list, bigram_list = make_frequencies(att)
        for v, k in unigram_list:
            unigram_freqs[k] += v
        for v, k in bigram_list:
            bigram_freqs[k] += v
    unigram_list = [(v, k) for (k, v) in unigram_freqs.iteritems()]
    # sort by descending frequency so the MAX_LIST most frequent items survive
    unigram_list.sort(reverse=True)
    del unigram_list[MAX_LIST:]
    unigram_alph = CPPAlphabet()
    for c, k in unigram_list:
        unigram_alph[k]  # looking a key up assigns it the next free id
    unigram_alph.tofile(
        file(os.path.join(outdir, 'unigram%s_alph.txt' % (suffix, )), 'w'))
    bigram_list = [(v, k) for (k, v) in bigram_freqs.iteritems()]
    bigram_list.sort(reverse=True)
    del bigram_list[MAX_LIST:]
    bigram_alph = CPPAlphabet()
    for c, k in bigram_list:
        bigram_alph[k]
    bigram_alph.tofile(
        file(os.path.join(outdir, 'bigram%s_alph.txt' % (suffix, )), 'w'))
Example #7
def cqp2vocab_main(argv=None):
    opts, args = oparse.parse_args(argv)
    frequencies = defaultdict(int)
    for arg in args:
        crp = Corpus(arg, registry_dir=CQP_REGISTRY)
        att = crp.attribute(opts.attr, 'p')
        if opts.encoding is not None and crp.get_encoding() != opts.encoding:
            print >> sys.stderr, "Recoding %s items from %s to %s" % (
                arg, crp.get_encoding(), opts.encoding)
            to_uni = crp.to_unicode
            enc = opts.encoding
            recode = lambda w: to_uni(w).encode(enc)
        else:
            recode = lambda x: x
        dic = att.getDictionary()
        for i in xrange(len(dic)):
            word = dic.get_word(i)
            frequencies[recode(word)] += att.frequency(word)
    for word in frequencies.keys():
        if frequencies[word] < opts.threshold:
            del frequencies[word]
    if opts.out_fname is None:
        f_out = sys.stdout
    else:
        f_out = file(opts.out_fname, 'w')
    for word in sorted(frequencies):
        print >> f_out, word
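
The function relies on a module-level optparse parser and a registry constant. A minimal sketch of what it assumes follows; the option strings are hypothetical, only the dest names and CQP_REGISTRY are implied by the code above.

from optparse import OptionParser

CQP_REGISTRY = '/usr/local/share/cwb/registry'
oparse = OptionParser(usage='%prog [options] CORPUS [CORPUS ...]')
oparse.add_option('--attr', dest='attr', default='word')
oparse.add_option('--encoding', dest='encoding', default=None)
oparse.add_option('--threshold', dest='threshold', type='int', default=1)
oparse.add_option('-o', dest='out_fname', default=None)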
Example #8
def main(argv=None):
    (opts, args) = oparse.parse_args(argv)
    if not args:
        oparse.print_help()
        sys.exit(1)
    corpus_name = args[0]
    if len(args) == 3:
        sent_start = int(args[1])
        sent_end = int(args[2])
    elif len(args) == 2:
        sent_start = 0
        sent_end = int(args[1])
    else:
        sent_start = 0
        sent_end = None
    columns = [None] * 14
    corpus = Corpus(corpus_name, registry_dir=CQP_REGISTRY)
    columns[0] = corpus.attribute('word', 'p')
    sent_attr = corpus.attribute('s', 's')
    if opts.fmt == 'conll':
        idx = 1
        for col in opts.xcolumns:
            if '=' in col:
                s_idx, att_name = col.split('=')
                s_idx = int(s_idx)
            else:
                s_idx = idx
                att_name = col
            columns[s_idx] = corpus.attribute(att_name, 'p')
            idx = s_idx + 1
        output_sentences(sent_attr, columns, sent_start, sent_end)
    elif opts.fmt == 'line':
        output_sentences_line(sent_attr, columns, sent_start, sent_end)
    elif opts.fmt == 'bllip':
        output_sentences_bllip(sent_attr,
                               columns,
                               sent_start,
                               sent_end,
                               corpus_name=corpus_name,
                               max_len=opts.max_len)
Example #9
    def find(self, token, show_pos=False, rsize=None):
        """Get concordance of a word."""
        if isinstance(token, unicode):
            token = token.encode('utf-8')
        else:
            try:
                token.decode('utf-8')
            except BaseException:
                raise UnicodeError('Encoding error!')
        self.conclst = []
        registry_dir = CONF.get('main', 'registry_dir')
        cqp = PyCQP_interface.CQP(bin=CONF.get('main', 'cqp_bin'),
                                  options='-c -r ' + registry_dir)
        cqp.Exec(self.corpus_name.upper() + ";")

        if token.startswith('cql:'):
            token = token[4:]
            cqp.Query(token)
        elif token.startswith('ncql:'):
            token = token[5:]
            token = convert_cql(token)
            cqp.Query(token)
        else:
            cqp.Query('[word="%s"];' % token)

        _rsize = int(cqp.Exec("size Last;"))
        if rsize is None:
            rsize = _rsize
        else:
            if rsize > _rsize:
                rsize = _rsize

        self.results = cqp.Dump(first=0, last=rsize)

        cqp.Terminate()
        if self.results == [['']]:
            return 'nores'

        corpus = Corpus(self.corpus_name, registry_dir=registry_dir)
        words = corpus.attribute("word", "p")

        with open(registry_dir + '/' + self.corpus_name) as f:
            cqpreg = f.read()
            s_attrs = re.findall(r'STRUCTURE\s(\w+)', cqpreg)

        s_attrs_dic = {}
        for attr in s_attrs:
            if attr != 's':
                s_attrs_dic[attr] = corpus.attribute(attr, "s")
        # fetch the POS attribute once, outside the attribute loop
        if show_pos == 1:
            postags = corpus.attribute("pos", "p")

        for line in self.results:
            output = dict()
            start = int(line[0])
            end = int(line[1]) + 1

            lw = words[start - self.window_size:start]
            if rsize < self.window_size:
                rw = words[end:end + rsize]
            else:
                rw = words[end:end + self.window_size]
            qw = words[start:end]

            if show_pos == 1:
                lp = postags[start - self.window_size:start]
                rp = postags[end:end + self.window_size]
                qp = postags[start:end]

                left = ' '.join([
                    '%s<span>/%s</span>' % (word, pos)
                    for word, pos in zip(lw, lp)
                ])
                mid = ' '.join([
                    '%s<span>/%s</span>' % (word, pos)
                    for word, pos in zip(qw, qp)
                ])
                right = ' '.join([
                    '%s<span>/%s</span>' % (word, pos)
                    for word, pos in zip(rw, rp)
                ])

            elif show_pos == 0:
                left = ' '.join(['%s' % word for word in lw])
                mid = ' '.join(['%s' % word for word in qw])
                right = ' '.join(['%s' % word for word in rw])

            metainfo = dict()
            for k in s_attrs_dic.iterkeys():
                metainfo[k] = s_attrs_dic[k].find_pos(start)[-1]
            output['conc'] = (left, mid, right)
            output['corp_name'] = DB[self.corpus_name]
            output['metainfo'] = metainfo
            output['qpos'] = '%s_%s_%s' % (self.corpus_name, start, end)
            self.conclst.append(output)
Example #10
class Counts:
    """
    returns df_counts:
    def: (p_att_1, p_att_2, ...), freq
    all p_atts are strings, " "-delimited for MWUs (split=NO)

    attributes:
    .corpus_name
    .attributes

    methods:
    ._cpos2patts

    .cpos      (cpos_list, p_atts)

    .dump      (df_dump, start, end, p_atts, split)
      - strategy 1: split NO|YES; flags  ; combo x
      - strategy 2: split   |YES; flags  ; combo

    .matches   (name, p_att, split, flags)
      - strategy 1: split NO|   ; flags x; combo
      - strategy 2: split NO|YES; flags x; combo
      - strategy 3: split   |YES; flags  ; combo x

    .mwus      (queries)
      - strategy 1: split NO| - ; flags x; combo x; mwu NO
      - strategy 2: split NO| - ; flags x; combo x; mwu YES
      - strategy 3: split NO| - ; flags x; combo  ; mwu YES

    TODO: counting with group?

    """
    def __init__(self,
                 corpus_name,
                 registry_path='/usr/local/share/cwb/registry/'):

        self.corpus_name = corpus_name
        self.registry_path = registry_path
        self.attributes = Crps(self.corpus_name, registry_dir=registry_path)

    def _cpos2patts(self, cpos, p_atts=['word'], ignore_missing=True):
        """Retrieves p-attributes of corpus position.

        :param int cpos: corpus position to fill
        :param list p_atts: p-attribute(s) to fill position with
        :param bool ignore_missing: whether to return -1 for out-of-bounds

        :return: p_att(s)
        :rtype: tuple

        """

        if ignore_missing and cpos == -1:
            token = [None] * len(p_atts)
        else:
            token = [
                self.attributes.attribute(p_att, 'p')[cpos] for p_att in p_atts
            ]

        return tuple(token)

    def cpos(self, cpos_list, p_atts=['word']):
        """Creates a frequency table for the p_att-values of the cpos-list.

        :param list cpos_list: corpus positions to fill
        :param list p_atts: p-attribute (combinations) to count

        :return: counts of the p_attribute (combinations) of the positions
        :rtype: DataFrame

        """
        lex_items = [self._cpos2patts(p, p_atts=p_atts) for p in cpos_list]
        counts = Counter(lex_items)
        df_counts = DataFrame.from_dict(counts,
                                        orient='index',
                                        columns=['freq'])
        df_counts.index = MultiIndex.from_tuples(df_counts.index, names=p_atts)
        return df_counts

    @time_it
    def dump(self,
             df_dump,
             start='match',
             end='matchend',
             p_atts=['word'],
             split=False,
             strategy=2):
        """Counts tokens in [start .. end] (columns in df_dump).

        :param list df_dump: corpus positions to fill
        :param str start: column name where to start counting
        :param str end: column name where to end counting
        :param list p_atts: p-attribute (combinations) to count
        :param bool split: token-based count? (default: MWU)
        :param int strategy: strategy 2 (cwb-scan-corpus) is faster,
                             does not support MWU counts though

        :return: counts of the p_attribute (combinations) of the positions
        :rtype: DataFrame

        """

        # choose strategy
        if strategy == 2 and not split:
            logger.warning("dump: cannot use cwb-scan-corpus for MWUs")
            strategy = 1
        logger.info("dump: strategy %d" % strategy)

        df_dump = df_dump.reset_index()  # for working with match, matchend

        if strategy == 1:

            logger.info("... extracting tokens")
            ls = df_dump.apply(
                lambda x: [
                    self._cpos2patts(cpos, p_atts)
                    for cpos in range(x[start], x[end] + 1)
                ],
                axis=1
            ).values  # list of list of tuples (p_att_1, p_att_2, ...)

            logger.info("... splitting")
            if split:
                tokens = [token for tokens in ls for token in tokens]
            else:
                tokens = [
                    tuple([" ".join(m) for m in zip(*mwu_list)])
                    for mwu_list in ls
                ]

            logger.info("... counting")
            counts = Counter(tokens)
            df_counts = DataFrame.from_dict(counts,
                                            orient='index',
                                            columns=['freq'])
            df_counts.index = MultiIndex.from_tuples(df_counts.index,
                                                     names=p_atts)

        elif strategy == 2:
            df_dump = df_dump.reset_index()
            with NamedTemporaryFile(mode="wt") as f:
                logger.info("... writing dump temporarily to disk")
                df_dump[[start, end]].to_csv(f.name,
                                             sep="\t",
                                             header=None,
                                             index=False)
                df_counts = cwb_scan_corpus(f.name, self.corpus_name,
                                            self.registry_path, p_atts)

        df_counts = df_counts.sort_values(by='freq', ascending=False)

        return df_counts

    @time_it
    def matches(self,
                cqp,
                name,
                p_atts=["word"],
                split=False,
                flags=None,
                strategy=3):
        """Counts tokens in [match .. matchend] of named subcorpus defined in
        running cqp.

        :param CQP cqp: running cqp process
        :param list name: name of the subcorpus
        :param list p_atts: p-attribute(-combinations) to count
        :param bool split: token-based count? (default: MWU)
        :param str flags: %c, %d, %cd

        :return: counts of the p_attribute (combinations) of the positions
        :rtype: DataFrame

        """

        # choose strategy
        combo = len(p_atts) > 1

        #    s f c
        # 1: - - -
        # 1: - x -
        # 2: - - -
        # 2: - x -
        # 2: x - -
        # 2: x x -
        # 3: x - -
        # 3: x - x

        # implemented:
        #    - - - 1,2
        #    - x - 1,2
        #    x - - 2,3
        #    x x - 2
        #    x - x 3

        # not implemented:
        #    - - x
        #    - x x
        #    x x x

        if combo:
            if flags or not split:
                raise NotImplementedError(
                    "matches does not support parameter combination:",
                    str(" ".join(
                        ['x' if x else '-' for x in [split, flags, combo]])))

        if strategy == 1:
            if split or combo:
                logger.warning("matches: cannot use cqp-count")
                strategy = 2
        if strategy == 2:
            if combo:
                logger.warning("matches: cannot use cqp-tabulate")
                strategy = 3
        if strategy == 3:
            if flags or not split:
                logger.warning("matches: cannot use cwb-scan-corpus")
                strategy = 2
        logger.info("matches: strategy %s" % strategy)

        if strategy == 1:
            # split NO; flags NO/YES; combo NO
            # generally slow
            logger.info("... cqp is counting")
            cqp_return = cqp.Exec('count %s by %s %s;' %
                                  (name, p_atts[0], flags))
            df_counts = read_csv(StringIO(cqp_return),
                                 sep="\t",
                                 header=None,
                                 names=["freq", "unknown", "item"])
            df_counts = df_counts.set_index('item')
            df_counts = df_counts[['freq']]
            df_counts.index.name = p_atts[0]

        elif strategy == 2:
            # split NO/YES; flags NO/YES; combo NO
            # generally faster
            logger.info("... cqp is tabulating")
            cqp_return = cqp.Exec('tabulate %s match .. matchend %s %s;' %
                                  (name, p_atts[0], flags))
            logger.info("... splitting tokens")
            if split:  # split strings into tokens
                cqp_return = cqp_return.replace(" ", "\n")
            tokens = cqp_return.split("\n")
            logger.info("... counting %d tokens" % len(tokens))
            df_counts = DataFrame.from_dict(Counter(tokens),
                                            orient='index',
                                            columns=['freq'])
            df_counts = df_counts[['freq']]
            df_counts.index.name = p_atts[0]

        elif strategy == 3:
            # split YES; flags NO; combo YES
            # generally fastest
            with NamedTemporaryFile(mode="wt") as f:
                logger.info("... writing dump temporarily to disk")
                cqp.Exec('dump %s > "%s";' % (name, f.name))
                df_counts = cwb_scan_corpus(f.name, self.corpus_name,
                                            self.registry_path, p_atts)

        df_counts = df_counts.sort_values(by='freq', ascending=False)

        return df_counts

    @time_it
    def mwus(self, cqp, queries, p_atts=None, fill_missing=True, strategy=1):
        """Calculates frequencies for MWU queries in activated subcorpus.
        queries are a list of valid CQP queries, e.g.
        '[lemma="Angela"%cd & pos="NE"] [lemma="Merkel"%cd & pos="NE"]?'

        caveat: different indices for different strategies

        :param CQP cqp: running cqp process
        :param set queries: set of query strings to get frequency breakdown for
        :param bool fill_missing: count 0 for missing items?
        :param int strategy: strategy to use (see below)

        :return: counts (index: queries (strategy 1) or tokens; column: freq)
        :rtype: DataFrame

        Strategy 1:
        for each item
            (1) run query for item
            (2) get size of corpus via cqp

        Strategy 2:
        (1) run query for all items at the same time
        (2) dump df
        (3) count_dump()

        Strategy 3:
        (1) run query for all items at the same time
        (2) count_matches()

        """

        queries = set(queries)  # only process each one query once
        name = 'tmp'  # subcorpus name to use

        if strategy == 1:
            if p_atts:
                logger.warning(
                    "mwus: cannot get frequency breakdown when not inspecting dump"
                )
                strategy = 2

        if not p_atts:
            p_atts = ['word']  # necessary for strategies 2 & 3

        if strategy == 3 and len(p_atts) > 1:
            logger.warning(
                "mwus: cannot combine query when looking at several p-attributes"
            )
            strategy = 2

        logger.info("mwus: strategy %s" % strategy)

        if strategy == 1:
            logger.info("... running each query")
            freqs = list()
            for query in queries:
                cqp.Exec('%s=%s;' % (name, query))
                freq = cqp.Exec('size %s;' % name)
                freqs.append(freq)
            df = DataFrame(data=freqs, index=queries, columns=['freq'])
            df.index.name = 'query'

        elif strategy == 2:
            query = "|".join(queries)
            cqp.Exec('%s=%s;' % (name, query))
            df_dump = cqp.Dump(name)
            df = self.dump(df_dump,
                           start='match',
                           end='matchend',
                           p_atts=p_atts,
                           split=False,
                           strategy=1)
            if len(p_atts) == 1:
                df.index = [item[0] for item in df.index]
                df.index.name = p_atts[0]

        elif strategy == 3:
            query = "|".join(queries)
            cqp.Exec('%s=%s;' % (name, query))
            df = self.matches(cqp,
                              name,
                              p_atts=p_atts,
                              split=False,
                              flags=None,
                              strategy=2)

        # post-process dataframe
        df["freq"] = df["freq"].astype(int)
        df = df.sort_values(by=["freq"], ascending=False)

        # df = df.loc[df["freq"] != 0]

        return df
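
A minimal usage sketch for Counts, assuming a corpus EXAMPLE_CORPUS is indexed under the default registry path (the corpus name and positions are hypothetical):

counts = Counts('EXAMPLE_CORPUS')
# frequency table of (word, pos) combinations at four corpus positions
df = counts.cpos([0, 1, 2, 1], p_atts=['word', 'pos'])
print(df.sort_values(by='freq', ascending=False))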
Example #11
    def find(self,
             token,
             rsize=None,
             show_pos=False,
             begin_time=None,
             end_time=None,
             board_list=None):
        if begin_time:
            if not isinstance(begin_time, int):
                raise TypeError('"begin_time" should be an "int"')
        if end_time:
            if not isinstance(end_time, int):
                raise TypeError('"end_time" should be an "int"')
        if board_list:
            if not isinstance(board_list, list):
                raise TypeError('"board_list" should be a "list"')

        for i in xrange(2001, datetime.today().year + 1):
            self.freq_by_year[i] = 0
        self.conclst = []
        registry_dir = '/usr/local/share/cwb/registry'
        cqp = PyCQP_interface.CQP(bin='/usr/local/bin/cqp',
                                  options='-c -r ' + registry_dir)
        cqp.Exec(self.corpus_name + ";")
        if token.startswith('['):
            wildcard = '.'
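            # widen each '.' wildcard: in a UTF-8 corpus indexed byte-wise,
            # a CJK character occupies three bytes, so (presumably) three
            # '.' are needed to match one character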
            for i in reversed(range(1, 6)):
                token = token.replace(wildcard * i, wildcard * i * 3)
            cqp.Query(token)
        else:
            cqp.Query('[word="%s"];' % token)
        if rsize is None:
            rsize = int(cqp.Exec("size Last;"))
        self.results = cqp.Dump(first=0, last=rsize)
        # heavy-handed cleanup: kills every running cqp process
        os.system('kill -9 $(pidof cqp)')
        if self.results == [['']]:
            return 'nores'

        corpus = Corpus(self.corpus_name, registry_dir=registry_dir)
        words = corpus.attribute("word", "p")
        if show_pos is True:
            postags = corpus.attribute("pos", "p")
        elif show_pos is not False:
            raise TypeError('"show_pos" should be a "bool"')

    #    sentences = corpus.attribute("s","s") -> find position in sentences (line number)
        ids = corpus.attribute("text_id", "s")
        boards = corpus.attribute("text_board", "s")
        ptimes = corpus.attribute("text_time", "s")
        for line in self.results:
            output = dict()
            start = int(line[0])
            end = int(line[1]) + 1

            # post_time filter
            ptime = ptimes.find_pos(start)[-1]
            if begin_time != None and end_time != None:
                if begin_time <= int(ptime) <= end_time:
                    pass
                else:
                    continue
            elif begin_time != None and end_time == None:
                if int(ptime) < begin_time:
                    continue

            elif begin_time == None and end_time != None:
                if int(ptime) > end_time:
                    continue

            # board_list filter
            board = boards.find_pos(start)[-1]

            if board_list:
                if board not in board_list:
                    continue

            lw = words[start - self.window_size:start]
            rw = words[end:end + self.window_size]
            qw = words[start:end]

            if show_pos is True:
                lp = postags[start - self.window_size:start]
                rp = postags[end:end + self.window_size]
                qp = postags[start:end]

                left = ' '.join([
                    '%s<span>/%s</span>' % (word, pos)
                    for word, pos in zip(lw, lp)
                ])
                mid = ' '.join([
                    '%s<span>/%s</span>' % (word, pos)
                    for word, pos in zip(qw, qp)
                ])
                right = ' '.join([
                    '%s<span>/%s</span>' % (word, pos)
                    for word, pos in zip(rw, rp)
                ])

            elif show_pos is False:
                left = ' '.join(['%s' % word for word in lw])
                mid = ' '.join(['%s' % word for word in qw])
                right = ' '.join(['%s' % word for word in rw])

#           s_bounds = sentences.find_pos(end-1)
            mongoid = ids.find_pos(start)[-1]
            if self.corpus_name == 'PTT':
                self.freq_by_year[int(ptime[:4])] += 1
            output['conc'] = (left, mid, right)
            output['board'] = board
            output['post_time'] = ptime
            output['mongoid'] = mongoid
            output['board_cht'] = BOARDREF[board]
            self.conclst.append(output)
        if self.time_order == -1:
            rev = True
        elif self.time_order == 1:
            rev = False
        else:
            raise ValueError('time order should be either 1 or -1')
        self.conclst.sort(key=lambda x: x['post_time'], reverse=rev)

        if self.corpus_name == 'PTT':
            # normalise the raw counts to per-year relative frequencies
            for y in self.freq_by_year.iterkeys():
                if str(y) in toknumByYear:
                    self.freq_by_year[y] = (float(self.freq_by_year[y]) /
                                            toknumByYear[str(y)])
Example #12
def f(corpus, query):
    """
    Send the query to CQP and shape the data that comes back.
        input: name of the corpus the query runs against, and the query itself
        output: a list of result dicts, one per match
    """

    registry_dir = "/usr/local/share/cwb/registry"
    cqp = PyCQP_interface.CQP(bin='/usr/local/cwb/bin/cqp',
                              options='-c -r ' + registry_dir)
    corpus_name = splitext(basename(corpus))[0].upper()
    dep = corpus_name.split("_")[1].upper()
    if re.match(r"^\d$", dep):
        # single-digit departement codes are zero-padded ("1" -> "01")
        dep = "0" + dep

    resultDep = []

    # Submit the query
    cqp.Exec(corpus_name + ";")
    cqp.Query(query)
    cqp.Exec("sort Last by word;")
    """
        Retrieve the results as a list (results) that contains one list per
        match of the query. Each inner list gives the corpus positions of the
        first and last token of the matching pattern.
    """
    rsize = int(cqp.Exec("size Last;"))
    results = cqp.Dump(first=0, last=rsize)

    corpus = Corpus(corpus_name, registry_dir=registry_dir)

    # used below to fetch the token, POS tag or lemma at a given position
    words = corpus.attribute("word", "p")
    postags = corpus.attribute("pos", "p")
    lemmas = corpus.attribute("lemma", "p")

    sentences = corpus.attribute(b"text", "s")
    ids = corpus.attribute(b"text_id", "s")  # 'ids' avoids shadowing the id() builtin
    dates = corpus.attribute(b"text_date", "s")
    geo = corpus.attribute(b"text_geo", "s")
    users = corpus.attribute(b"text_user", "s")

    cqp.Terminate()

    if (results != [[""]]):
        for r in results:
            left_context = []
            right_context = []
            start = int(r[0])
            end = int(r[1])

            # start/end corpus positions of the tweet containing the match
            s_bounds = sentences.find_pos(end)
            # its s-attribute values (id, date, coordinates and user id)
            id_bounds = ids.find_pos(end)
            date_bounds = dates.find_pos(end)
            geo_bounds = geo.find_pos(end)
            user_bounds = users.find_pos(end)

            coord = geo_bounds[-1].decode("utf8").split(", ")

            # collect the positions of the left- and right-context words
            for pos in range(s_bounds[0], s_bounds[1] + 1):
                if (pos < start):
                    left_context.append(pos)
                if (pos > end):
                    right_context.append(pos)

            # build the dict that will hold the fields we care about
            result = {
                "id": id_bounds[-1],
                "date": date_bounds[-1].decode("utf8").split("T")[0],
                "geo": coord,
                "dep": dep,
                "user": user_bounds[-1],
                "hide_column": "",
                "left_context": "",
                "pattern": "",
                "right_context": ""
            }

            lc_tokens = []
            lc_pos = []
            lc_lemmas = []
            rc_tokens = []
            rc_pos = []
            rc_lemmas = []

            # left context (tokens, POS tags and lemmas)
            for lp in left_context:
                lc_tokens.append(words[lp])
                lc_pos.append(postags[lp])
                lc_lemmas.append(lemmas[lp])
            lc_tokens = reconstituteString(lc_tokens)
            lc_pos = " ".join(lc_pos)
            lc_lemmas = " ".join(lc_lemmas)

            # the matched pattern (tokens, POS tags and lemmas)
            pattern_tokens = reconstituteString(words[start:end + 1])
            pattern_pos = " ".join(postags[start:end + 1])
            pattern_lemmas = " ".join(lemmas[start:end + 1])

            # right context (tokens, POS tags and lemmas)
            for rp in right_context:
                rc_tokens.append(words[rp])
                rc_pos.append(postags[rp])
                rc_lemmas.append(lemmas[rp])
            rc_tokens = reconstituteString(rc_tokens)
            rc_pos = " ".join(rc_pos)
            rc_lemmas = " ".join(rc_lemmas)

            # formatting is done here to avoid extra processing client-side
            result["hide_column"] = lc_tokens[::-1]
            result["left_context"] = ('<span title="' + lc_pos + '&#10;' +
                                      lc_lemmas + '">' + lc_tokens + '</span>')
            result["pattern"] = ('<span title="' + pattern_pos + '&#10;' +
                                 pattern_lemmas + '">' + pattern_tokens +
                                 '</span>')
            result["right_context"] = ('<span title="' + rc_pos + '&#10;' +
                                       rc_lemmas + '">' + rc_tokens +
                                       '</span>')

            resultDep.append(result)

    # kill the CQP process explicitly, since otherwise it does not exit
    os.popen("kill -9 " + str(cqp.CQP_process.pid))

    return resultDep
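
A hypothetical invocation of f (the file name only serves to derive the corpus name, so 'DEP_75' must exist in the CWB registry; both the path and the query are illustrative):

rows = f('/corpora/vrt/dep_75.vrt', '[word="très"] [pos="ADJ"];')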
Example #13
registry_dir='/usr/local/share/cwb/registry'

for corpus_name in ["DEWAC01"]: #+['DEWAC%02d'%(x,) for x in range(1,10)]:
    print corpus_name
    cqp=PyCQP_interface.CQP(bin='/usr/local/bin/cqp',options='-c -r '+registry_dir)
    cqp.Exec(corpus_name+";")
    cqp.Query('[word="ist|sind|war|waren|seid"] [pos="ADV|PPER"]* [word=".+[elr]n"] [pos="\$.|KON"];')
    cqp.Exec("sort Last by word;")
    rsize=int(cqp.Exec("size Last;"))
    results=cqp.Dump(first=0,last=rsize)
    cqp.Terminate()

    f=file(corpus_name+'_absentiv.txt','w')
    #f=sys.stdout
    corpus=Corpus(corpus_name,registry_dir=registry_dir);
    words=corpus.attribute("word","p")
    postags=corpus.attribute("pos","p")
    sentences=corpus.attribute("s","s")
    texts=corpus.attribute("text_id","s")
    for line in results:
        start=int(line[0])
        end=int(line[1])
        wn=words[end-1]
        posn=postags[end-1]
        if posn.startswith('VV'):
            s_bounds=sentences.find_pos(end-1)
            text_bounds=texts.find_pos(end-1)
            print >>f,"# %s"%(text_bounds[2],)
            print >>f,"%10d:"%(int(line[1]),),
            for pos in xrange(s_bounds[0],s_bounds[1]+1):
                if pos==end-1:
                    # match marker format assumed; the listing breaks off here
                    print >>f,"<<%s>>"%(words[pos],),
                else:
                    print >>f,words[pos],
            print >>f
    f.close()
Example #14
def query():

    query = request.form["query"] + ";"
    query_result = []

    # one (corpus, query) pair per departement corpus: dep_1 .. dep_95, with
    # dep_20 replaced by Corsica's dep_2a/dep_2b (96 corpora in total)
    dep_codes = sorted([str(n) for n in range(1, 96) if n != 20] + ["2a", "2b"])
    corpus_list = [("dep_" + code, query) for code in dep_codes]

    # As many processes as given as argument to Pool will share the tasks
    # (fetching the CQP query result for each departement).

    pool = Pool(processes=None)
    try:
        query_result = pool.starmap(f, corpus_list)
    finally:
        pool.close()
        pool.join()

    if query_result[0] is False:
        return "Erreur de syntaxe"

    else:
        allResults = []

        freqParDepartement = defaultdict(int)
        # build a dataframe holding the frequency of the searched pattern in
        # each departement, and gather all results into one single list
        for depResult in query_result:
            for codeDep in depResult:
                freqParDepartement[codeDep] = depResult[codeDep][
                    "nbTotalResults"]
                if depResult[codeDep]["results"] != [['']]:
                    for result in depResult[codeDep]["results"]:
                        allResults.append({"dep": codeDep, "result": result})

        # compute the specificity scores
        freqParDepartementOrdered = OrderedDict(
            sorted(freqParDepartement.items(), key=lambda t: t[0]))
        df_queryFreq = pd.DataFrame(freqParDepartementOrdered,
                                    index=["freq"]).fillna(0)
        specif = specificities(df_queryFreq)

        resultsExtract = []
        registry_dir = "/usr/local/share/cwb/registry"
        # fetch and format left/right contexts for a sample of the results
        # only (200 drawn at random)
        random.shuffle(allResults)
        for i, dic in enumerate(allResults):
            if i < 200:
                dep = dic["dep"]
                if (re.match(r"^0\d$", dep)):
                    corpus_name = "dep_" + re.match(r"^0(\d)$",
                                                    dep).group(1).lower()
                else:
                    corpus_name = "dep_" + dep.lower()

                r = dic["result"]

                corpus = Corpus(corpus_name, registry_dir=registry_dir)

                # used below to fetch the token, POS tag or lemma at a given position
                words = corpus.attribute("word", "p")
                postags = corpus.attribute("pos", "p")
                lemmas = corpus.attribute("lemma", "p")

                sentences = corpus.attribute(b"text", "s")

                left_context = []
                right_context = []
                start = int(r[0])
                end = int(r[1])

                # start/end corpus positions of the tweet containing the match
                s_bounds = sentences.find_pos(end)

                # collect the positions of the left- and right-context words
                for pos in range(s_bounds[0], s_bounds[1] + 1):
                    if (pos < start):
                        left_context.append(pos)
                    if (pos > end):
                        right_context.append(pos)

                # build the dict that will hold the fields we care about
                result = {
                    "dep": dep,
                    "hide_column": "",
                    "left_context": "",
                    "pattern": "",
                    "right_context": ""
                }

                lc_tokens = []
                lc_pos = []
                lc_lemmas = []
                rc_tokens = []
                rc_pos = []
                rc_lemmas = []

                # left context (tokens, POS tags and lemmas)
                for lp in left_context:
                    lc_tokens.append(words[lp])
                    lc_pos.append(postags[lp])
                    lc_lemmas.append(lemmas[lp])
                lc_tokens = reconstituteString(lc_tokens)
                lc_pos = " ".join(lc_pos)
                lc_lemmas = " ".join(lc_lemmas)

                # the matched pattern (tokens, POS tags and lemmas)
                pattern_tokens = reconstituteString(words[start:end + 1])
                pattern_pos = " ".join(postags[start:end + 1])
                pattern_lemmas = " ".join(lemmas[start:end + 1])

                # right context (tokens, POS tags and lemmas)
                for rp in right_context:
                    rc_tokens.append(words[rp])
                    rc_pos.append(postags[rp])
                    rc_lemmas.append(lemmas[rp])
                rc_tokens = reconstituteString(rc_tokens)
                rc_pos = " ".join(rc_pos)
                rc_lemmas = " ".join(rc_lemmas)

                # formatting is done here to avoid extra processing client-side
                result["hide_column"] = lc_tokens[::-1]
                result["left_context"] = ('<span title="' + lc_pos + '&#10;' +
                                          lc_lemmas + '">' + lc_tokens +
                                          '</span>')
                result["pattern"] = ('<span title="' + pattern_pos + '&#10;' +
                                     pattern_lemmas + '">' + pattern_tokens +
                                     '</span>')
                result["right_context"] = ('<span title="' + rc_pos + '&#10;' +
                                           rc_lemmas + '">' + rc_tokens +
                                           '</span>')

                resultsExtract.append(result)


        resultAndSpec = {}
        resultAndSpec["result"] = resultsExtract
        resultAndSpec["specif"] = specif
        resultAndSpec["nbResults"] = int(df_queryFreq.sum().sum())
        resultAndSpec["nbOccurrences"] = freqParDepartement
        resultAndSpec = ujson.dumps(resultAndSpec)

        return resultAndSpec
Example #15
    def find(self,
             token,
             begin_time=20140601,
             end_time=20150531,
             board_list=['Gossiping']):
        if begin_time:
            if not isinstance(begin_time, int):
                raise TypeError('"begin_time" should be an "int"')
        if end_time:
            if not isinstance(end_time, int):
                raise TypeError('"end_time" should be an "int"')
        if board_list:
            if not isinstance(board_list, list):
                raise TypeError('"board_list" should be a "list"')

        self.conclst = []
        registry_dir = '/usr/local/share/cwb/registry'
        cqp = PyCQP_interface.CQP(bin='/usr/local/bin/cqp',
                                  options='-c -r ' + registry_dir)
        cqp.Exec(self.corpus_name + ";")
        cqp.Query('[word="%s"];' % token)

        rsize = int(cqp.Exec("size Last;"))
        self.results = cqp.Dump(first=0, last=rsize)
        os.system('kill -9 $(pidof cqp)')
        if self.results == [['']]:
            return None

        corpus = Corpus(self.corpus_name, registry_dir=registry_dir)
        words = corpus.attribute("word", "p")

        ids = corpus.attribute("text_id", "s")
        boards = corpus.attribute("text_board", "s")
        ptimes = corpus.attribute("text_time", "s")
        for num, line in enumerate(self.results, 1):
            print num
            output = dict()
            start = int(line[0])
            end = int(line[1]) + 1

            # post_time filter
            ptime = ptimes.find_pos(start)[-1]
            if begin_time != None and end_time != None:
                if begin_time <= int(ptime) <= end_time:
                    pass
                else:
                    continue
            elif begin_time != None and end_time == None:
                if int(ptime) < begin_time:
                    continue

            elif begin_time == None and end_time != None:
                if int(ptime) > end_time:
                    continue

            # board_list filter
            board = boards.find_pos(start)[-1]

            if board_list:
                if board not in board_list:
                    continue

            lw = words[start - self.window_size:start]
            rw = words[end:end + self.window_size]
            qw = words[start:end]

            left = ' '.join(['%s' % word for word in lw])
            mid = ' '.join(['%s' % word for word in qw])
            right = ' '.join(['%s' % word for word in rw])

            mongoid = ids.find_pos(start)[-1]

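            # truncate the day: YYYYMMDD -> YYYYMM, for per-month counting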
            ptime = int(str(ptime)[:-2])
            print ptime
            self.freq_by_month[ptime] += 1

            output['conc'] = (left, mid, right)
            output['board'] = board
            output['post_time'] = ptime
            output['mongoid'] = mongoid

            self.conclst.append(output)
        if self.time_order == -1:
            rev = True
        elif self.time_order == 1:
            rev = False
        else:
            raise ValueError('time order should be either 1 or -1')
        self.conclst.sort(key=lambda x: x['post_time'], reverse=rev)
Example #16
File: cwb.py  Project: dokempf/cwb-ccc
class Corpus:
    """Interface to CWB-indexed corpus.

    After initializing, the corpus class has ...

    ... the following attributes:
    .data_path
    .registry_path
    .cqp_bin
    .lib_path
    .corpus_name
    .subcorpus [None]
    .attributes_available
    .corpus_size

    ... the following initialized classes:
    .attributes
    .cache
    .counts

    ... the following methods:
    .__str__
    ._attributes_available
    .start_cqp
    .copy
    .cpos2patts                 # p-attributes
    .marginals                  # p-attributes
    .cpos2sid                   # s-attributes
    .show_nqr                   # subcorpora
    .dump_from_s_att            # creating dumps
    .dump_from_query            # creating dumps
    .dump2patt                  # working on dumps
    .dump2satt                  # working on dumps
    .dump2context               # working on dumps
    .query_s_att                # query alias
    .query                      # query alias

    """
    def __init__(self,
                 corpus_name,
                 lib_path=None,
                 cqp_bin='cqp',
                 registry_path='/usr/local/share/cwb/registry/',
                 data_path='/tmp/ccc-data/'):
        """Establish connection to CQP and corpus attributes, set paths, read
        library. Raises KeyError if corpus not in registry.

        :param str corpus_name: name of corpus in CWB registry
        :param str lib_path: /path/to/macros/and/wordlists/
        :param str cqp_bin: /path/to/cqp-binary
        :param str registry_path: /path/to/cwb/registry/
        :param str data_path: /path/to/data/and/cache/

        """

        # process data path
        if data_path is not None:
            if not data_path.endswith(corpus_name):
                data_path = os.path.join(data_path, corpus_name)
            self.data_path = data_path
            if not os.path.isdir(self.data_path):
                os.makedirs(self.data_path)
            cache_path = os.path.join(self.data_path, "CACHE")
        else:
            self.data_path = None
            cache_path = None

        # set registry path and cqp_bin
        self.registry_path = registry_path
        self.cqp_bin = cqp_bin

        # macros and wordlists
        self.lib_path = lib_path

        # init (sub-)corpus information
        self.corpus_name = corpus_name
        self.subcorpus = None

        # init attributes
        self.attributes = Attributes(self.corpus_name,
                                     registry_dir=self.registry_path)

        # get corpus size
        self.corpus_size = len(self.attributes.attribute('word', 'p'))

        # get available corpus attributes
        self.attributes_available = self._attributes_available()

        # init cache
        self.cache = Cache(cache_path)

        # init counts
        self.counts = Counts(self.corpus_name, self.registry_path)

    def __str__(self):
        """Method for printing.

        :return: corpus_name, corpus_size, data_path, subcorpus
        :rtype: str

        """

        return "\n".join([
            'a ccc.Corpus: "%s"' % self.corpus_name,
            "size        : %s" % str(self.corpus_size),
            "data        : %s" % str(self.data_path),
            "subcorpus   : %s" % str(self.subcorpus),
            "attributes  :",
            self.attributes_available.to_string(),
        ])

    def _attributes_available(self):
        """Get indexed p- and s-attributes. Will be run once when initializing
        the corpus.

        :return: attributes and annotation info
        :rtype: DataFrame

        """

        # use CQP's context descriptor
        cqp = self.start_cqp()
        cqp_ret = cqp.Exec('show cd;')
        cqp.__kill__()

        # read as dataframe
        attributes = read_csv(
            StringIO(cqp_ret),
            sep='\t',
            names=['type', 'attribute', 'annotation', 'active']).fillna(False)

        # post-process Boolean columns
        attributes['active'] = (attributes['active'] == "*")
        attributes['annotation'] = (attributes['annotation'] == '-V')

        return attributes

    def start_cqp(self):
        """Start CQP process.

        :return: CQP process
        :rtype: CQP

        """

        return start_cqp(self.cqp_bin, self.registry_path, self.data_path,
                         self.corpus_name, self.lib_path, self.subcorpus)

    def copy(self):
        """Get a fresh initialization of the corpus.

        :return: corpus
        :rtype: Corpus

        """

        return Corpus(self.corpus_name, self.lib_path, self.cqp_bin,
                      self.registry_path, self.data_path)

    ################
    # p-attributes #
    ################
    def cpos2patts(self, cpos, p_atts=['word'], ignore=True):
        """Retrieve p-attributes of corpus position.

        :param int cpos: corpus position to fill
        :param list p_atts: p-attribute(s) to fill position with
        :param bool ignore: whether to return (None, .*) for -1

        :return: p-attribute(s) at cpos
        :rtype: tuple

        """

        if cpos == -1 and ignore:
            token = [None] * len(p_atts)
        else:
            token = [
                self.attributes.attribute(p_att, 'p')[cpos] for p_att in p_atts
            ]

        return tuple(token)

    def marginals(self, items, p_att='word', flags=0, pattern=False):
        """Extract marginal frequencies for given unigrams or unigram patterns
        of a single p-attribute.  0 if not in corpus.  For
        combinations of p-attributes, see marginals_complex.

        :param list items: items to get marginals for
        :param str p_att: p-attribute to get frequencies for
        :param int flags: 1 = %c, 2 = %d, 3 = %cd (will activate wildcards)
        :param bool pattern: activate wildcards?

        :return: frequencies of the items in the whole corpus indexed by items
        :rtype: FreqFrame

        """

        pattern = True if flags > 0 else pattern

        tokens_all = self.attributes.attribute(p_att, 'p')

        # loop through items and collect frequencies
        counts = list()
        for item in items:
            if not pattern:
                try:
                    counts.append(tokens_all.frequency(item))
                except KeyError:
                    counts.append(0)
            else:
                cpos = tokens_all.find_pattern(item, flags=flags)
                counts.append(len(cpos))

        # create dataframe
        df = DataFrame({'freq': counts, p_att: items})
        df = df.set_index(p_att)

        return df
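
    # Usage sketch (items are illustrative): exact lookup vs. patterns.
    #   corpus.marginals(['walk', 'walks'])         # exact frequencies
    #   corpus.marginals(['walk'], flags=1)         # %c: ignore case
    #   corpus.marginals(['walk.*'], pattern=True)  # wildcard pattern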

    def marginals_complex(self, items, p_atts=['word']):
        """Extract marginal frequencies for p-attribute combinations,
        e.g. ["word", "lemma"].  0 if not in corpus.  Marginals are
        retrieved using cwb-scan-corpus, result is cached.

        :param list items: list of tuples
        :param list p_atts: list of p-attributes

        :return: counts of the items in the whole corpus indexed by items
        :rtype: FreqFrame

        """

        # retrieve all marginals for p-att combination from cache if possible
        identifier = "_".join(p_atts) + "_marginals"
        df = self.cache.get(identifier)
        if df is not None:
            logger.info('using cached version of marginals of "%s"' %
                        "_".join(p_atts))
        else:
            # calculate all marginals for p-att combination
            df = cwb_scan_corpus(None, self.corpus_name, self.registry_path,
                                 p_atts)
            self.cache.set(identifier, df)

        # select relevant rows
        df = df.reindex(items)
        df = df.fillna(0, downcast='infer')
        return df
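
    # Usage sketch (pairs are illustrative): joint (word, lemma) frequencies.
    #   corpus.marginals_complex([('walks', 'walk'), ('ran', 'run')],
    #                            p_atts=['word', 'lemma'])
    # The first call scans the whole corpus via cwb-scan-corpus and caches
    # the full marginals table; subsequent calls only reindex the cached frame.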

    ################
    # s-attributes #
    ################
    def cpos2sid(self, cpos, s_att):
        """Get cwb-id of s-att at cpos. -1 if not present.

        :param int cpos: corpus position
        :param str s_att: s-attribute to get cwb-id for

        :return: cwb-id
        :rtype: int

        """
        s_attributes = self.attributes.attribute(s_att, "s")
        try:
            return s_attributes.cpos2struc(cpos)
        except KeyError:
            return -1
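
    # Usage sketch: map a corpus position to the id of its <s> region.
    #   corpus.cpos2sid(42, 's')  # struc id, or -1 outside any <s> span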

    ##############
    # subcorpora #
    ##############
    def show_nqr(self):
        """Get subcorpora defined in CQP as DataFrame.

        :return: available subcorpora
        :rtype: DataFrame

        """
        cqp = self.start_cqp()
        cqp_return = cqp.Exec("show named;")
        try:
            df = read_csv(StringIO(cqp_return), sep="\t", header=None)
            df.columns = ["storage", "corpus:subcorpus", "size"]
            crpssbcrps = df["corpus:subcorpus"].str.split(":", n=1).str
            df['corpus'] = crpssbcrps[0]
            df['subcorpus'] = crpssbcrps[1]
            df.drop('corpus:subcorpus', axis=1, inplace=True)
            df = df[['corpus', 'subcorpus', 'size', 'storage']]
        except EmptyDataError:
            logger.info("no subcorpora defined")
            df = DataFrame()

        cqp.__kill__()
        return df

    def activate_subcorpus(self, nqr=None, df_dump=None):
        """Activate subcorpus.  If no df_dump is given, this sets
        self.subcorpus and logs an error if subcorpus not defined.  If
        a df_dump is given, the df_dump will be undumped.

        :param str subcorpus: subcorpus name defined in CQP
        :param DataFrame df_dump: DataFrame indexed by (match, matchend)
                                  with optional columns 'target' and 'keyword'

        """

        if df_dump is not None:
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_dump, nqr)
            cqp.nqr_save(self.corpus_name, nqr)
            cqp.__kill__()

        if nqr is not None:
            # raise an error if subcorpus not available
            if nqr not in self.show_nqr()['subcorpus'].values:
                logger.error('subcorpus "%s" not defined' % nqr)
                self.activate_subcorpus()
            else:
                logger.info('switched to subcorpus "%s"' % nqr)
        else:
            logger.info('switched to corpus "%s"' % self.corpus_name)

        # activate subcorpus
        self.subcorpus = nqr
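
    # Usage sketch (NQR name is an assumption):
    #   corpus.activate_subcorpus('MyNQR', df_dump)  # undump, save, activate
    #   corpus.activate_subcorpus('MyNQR')           # activate an existing NQR
    #   corpus.activate_subcorpus()                  # back to the full corpus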

    ##################
    # CREATING DUMPS #
    ##################
    def dump_from_s_att(self, s_att, annotation=True):
        """Create s-attribute spans as DataFrame of corpus positions.
        Resulting df_dump is indexed by (match, matchend).

        Note that s-attribute values are not indexed by the CWB; the
        CWB.CL implementation just iterates over all annotations.

        This method thus creates a dataframe with the
        (non-overlapping) spans encoded as matches

        === (match, matchend), $s_cwbid, $s* ===

        and caches the result.

        $s is only created if annotation is True and attribute is
        actually annotated.

        :param str s_att: s-attribute to get spans and annotation for
        :param bool annotation: whether to retrieve annotation (if present)

        :return: df_dump
        :rtype: DataFrame

        """

        # retrieve from cache if possible
        identifier = s_att + "_spans"
        df = self.cache.get(identifier)
        if df is not None:
            logger.info('using cached version of spans of "%s"' % s_att)
            return df

        # compute
        logger.info('creating dataframe of spans of "%s"' % s_att)

        df = DataFrame(list(self.attributes.attribute(s_att, 's')))

        # two or three columns: 0 (start), 1 (end), 2* (annotation)
        if annotation:
            annotation = (2 in df.columns)  # just to make sure ...
            if not annotation:
                logger.info('s-att "%s" does not have any annotation' % s_att)
            else:
                df[2] = df[2].apply(lambda x: x.decode('utf-8'))

        # post-process
        df = df.reset_index()
        df = df.rename(
            {
                'index': s_att + '_cwbid',
                0: 'match',
                1: 'matchend',
                2: s_att
            },
            axis=1)
        df = df.set_index(['match', 'matchend'])

        # put into cache
        self.cache.set(identifier, df)

        return df
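
    # Usage sketch (s-attribute name is an assumption):
    #   df = corpus.dump_from_s_att('text_id')
    #   # index (match, matchend); columns: text_id_cwbid and, if the
    #   # attribute is annotated, text_id with the decoded values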

    def dump_from_query(self,
                        query,
                        s_query=None,
                        anchors=[],
                        match_strategy='standard',
                        name='Last',
                        save=False):
        """Execute query, get DataFrame of corpus positions (CWB dump).
        Resulting df_dump is indexed by (match, matchend).

        Note that in the CWB, only two anchors can be active at the
        same time. The method thus runs the query once with anchors 0
        and 1, and then collects the remaining anchor points pairwise
        by re-running the query on the NQR of the first run.  Optional
        columns for each anchor:

        === (match, matchend), 0*, ..., 9* ===

        The result is cached.

        :param str query: valid CQP query (without 'within' clause)
        :param str s_query: s-attribute used for initial query
        :param list anchors: anchors to search for
        :param str match_strategy: CQP matching strategy
        :param str name: name for NQR
        :param bool save: whether to save NQR to disk

        :return: df_dump
        :rtype: DataFrame

        """

        # identify query
        if self.subcorpus is not None:
            # check subcorpus size to avoid confusion when re-naming
            cqp = self.start_cqp()
            sbcrpssize = cqp.Exec("size %s" % self.subcorpus)
            cqp.__kill__()
        else:
            sbcrpssize = None
        identifier = self.cache.generate_idx(
            [query, s_query, anchors, match_strategy,
             self.subcorpus, sbcrpssize],
            prefix="df_dump:")

        # retrieve from cache if possible
        df_dump = self.cache.get(identifier)
        if df_dump is not None:
            logger.info(
                'using cached version "%s" of df_dump with %d matches' %
                (identifier, len(df_dump)))
            return df_dump

        # init cqp and set matching strategy
        cqp = self.start_cqp()
        cqp.Exec('set MatchingStrategy "%s";' % match_strategy)

        # include optional within clause
        if s_query is None:
            start_query = query
        else:
            start_query = query + ' within ' + s_query

        # first run: anchors at 0 and 1 (considering within clause)
        logger.info("running CQP query")
        cqp.Exec('set ant 0; set ank 1;')
        df_dump = cqp.nqr_from_query(query=start_query,
                                     name=name,
                                     match_strategy=match_strategy,
                                     return_dump=True)
        df_dump.columns = [0, 1]
        logger.info("found %d matches" % len(df_dump))

        # if there's nothing to return ...
        if df_dump.empty:
            cqp.__kill__()
            return df_dump

        # join all other anchors
        remaining_anchors = list(chunk_anchors(anchors, 2))
        if len(remaining_anchors) > 0:

            # restrict subsequent queries on initial matches
            cqp.nqr_activate(self.corpus_name, name)

            for pair in remaining_anchors:
                logger.info(".. running query for anchor(s) %s" % str(pair))
                # set appropriate anchors
                cqp.Exec('set ant %d;' % pair[0])
                if len(pair) == 2:
                    cqp.Exec('set ank %d;' % pair[1])
                else:
                    cqp.Exec('set ank %d;' % 1)
                # dump new anchors
                cqp.Query('tmp = <match> ( %s );' % query)
                df = cqp.Dump("tmp")
                # select columns and join to global df
                if len(pair) == 2:
                    df.columns = [pair[0], pair[1]]
                else:
                    df.columns = [pair[0], 1]
                    df = df.drop(1, axis=1)
                df_dump = df_dump.join(df)

            # NA handling
            logger.info("post-processing dataframe")
            df_dump = df_dump.dropna(axis=1, how='all')
            df_dump = df_dump.fillna(-1, downcast='integer')

        # restrict output to requested anchors
        df_dump = df_dump[anchors]

        # put into cache
        self.cache.set(identifier, df_dump)

        if save:
            cqp.nqr_save(self.corpus_name, name)

        cqp.__kill__()

        return df_dump
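
    # Usage sketch (query and anchor syntax are illustrative):
    #   df = corpus.dump_from_query(r'@0[pos="JJ"] @1[pos="NN"]',
    #                               s_query='s', anchors=[0, 1])
    #   # index (match, matchend); one integer column per requested anchor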

    #################################################
    # WORKING ON DUMPS ##############################
    #################################################
    def _dump2patt_row(self, row, p_att, start, end):
        """Retrieve p-attribute annotation from start to end of one row.

        :param Series row: dataframe row that contains the start and end keys
        :param str p_att: p-attribute to retrieve
        :param start: key of start column (int or str)
        :param end: key of end column (int or str)

        :return: p-attribute annotation
        :rtype: str
        """

        cpos_start = row[start]
        cpos_end = row[end]

        # if both are missing, return empty string
        if cpos_start == cpos_end == -1:
            return ""
        # if one of them is missing, set start = end or end = start
        if cpos_start == -1:
            cpos_start = cpos_end
        if cpos_end == -1:
            cpos_end = cpos_start

        # lexicalize
        p = self.attributes.attribute(p_att, 'p')
        return " ".join(p[int(cpos_start):int(cpos_end) + 1])

    def dump2patt(self, df_dump, p_att='word', start='match', end='matchend'):
        """Retrieve p-attribute annotation from start to end.

        === (match, matchend), $p ===

        Any additional columns of df_dump are preserved; in case of
        conflicts, the original column will be overwritten.

        :param DataFrame df_dump: DataFrame with specified columns (possibly as index)
        :param str p_att: p-attribute to retrieve
        :param start: key of start column (int or str)
        :param end: key of end column (int or str)

        :return: df_dump
        :rtype: DataFrame

        """

        # pre-process
        index_names = df_dump.index.names
        df = df_dump.reset_index()

        # retrieve attribute
        df[p_att] = df.apply(
            lambda row: self._dump2patt_row(row, p_att, start, end), axis=1)

        # post-process
        df = df.set_index(index_names)

        return df
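
    # Usage sketch: lexicalize spans of a dump.
    #   df = corpus.dump2patt(df, p_att='word')  # word forms of match..matchend
    # After dump2context, other spans can be lexicalized the same way, e.g.
    #   df = corpus.dump2patt(df, 'lemma', start='context', end='match')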

    def dump2satt(self, df_dump, s_att, annotation=True):
        """Retrieve cwb-id, span, and annotation of s-attribute at match.

        Note that this only takes the match into account, not the
        matchend. This is reasonable as long as the retrieved s-att
        spans comprise complete matches (match..matchend).

        === (match, matchend), $s_cwbid, $s_span, $s_spanend, $s*  ===

        $s is only created if annotation is True and attribute is
        actually annotated.

        Any additional columns of df_dump are preserved; in case of
        conflicts, the original columns will be overwritten.

        :param DataFrame df_dump: DataFrame indexed by (match, matchend)
        :param str s_att: s-attribute to retrieve
        :param bool annotation: whether to retrieve annotation of s-att

        :return: df_dump
        :rtype: DataFrame

        """

        # init dataframe
        df = df_dump.reset_index()[['match', 'matchend']]

        # get $s_id
        df[s_att + "_cwbid"] = df.match.apply(
            lambda x: self.cpos2sid(x, s_att))
        nr_missing = (df[s_att + '_cwbid'] == -1).sum()
        logger.info('s-att "%s" exists at %d of %d matches' %
                    (s_att, len(df) - nr_missing, len(df)))

        # retrieve where possible
        s_attributes = self.attributes.attribute(s_att, "s")
        df_s = df.loc[~(df[s_att + "_cwbid"] == -1)]
        df = df.join(
            DataFrame(index=df_s.index,
                      data=df_s[s_att + "_cwbid"].apply(
                          lambda x: s_attributes[int(x)]).to_list()))

        # two or three columns: 0 (start), 1 (end), 2* (annotation)
        if annotation:
            annotation = (2 in df.columns)  # just to make sure ...
            if not annotation:
                logger.info('s-att "%s" does not have any annotation' % s_att)
            else:
                df[2] = df[2].apply(lambda x: x.decode('utf-8'))

        # restore original index and post-process
        df = df.set_index(['match', 'matchend'])
        df = df.rename(
            {
                0: s_att + '_span',
                1: s_att + '_spanend',
                2: s_att
            },
            axis=1)

        # join to original dataframe
        df_dump = df_dump.join(df, lsuffix='_bak')
        df_dump = df_dump[[
            col for col in df_dump if not str(col).endswith('_bak')
        ]]
        span_cols = [s_att + '_span', s_att + '_spanend']
        df_dump[span_cols] = df[span_cols].fillna(-1, downcast='infer')

        return df_dump
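
    # Usage sketch: attach sentence spans to each match.
    #   df = corpus.dump2satt(df, 's', annotation=False)
    #   # adds s_cwbid, s_span, s_spanend (-1 where no <s> covers the match)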

    def dump2context(self, df_dump, context_left, context_right,
                     context_break):
        """Extend df_dump to context, breaking the context at context_break.

        === (match, matchend), contextid*, context, contextend  ===

        Columns for $context_break, $context_break_cwbid,
        $context_break_span, and $context_break_spanend are also
        created (if applicable).

        Any additional columns of df_dump are preserved.

        Note that in contrast to matches, contexts may overlap.

        For positions where the s-att specified by context_break does
        not exist, context_break is ignored.

        The context creation algorithm does not take into account that
        match and matchend may be part of different spans defined by
        context_break; it only looks at the s-attributes of match, not
        of matchend.

        For the _context_ column (left hand side), the strategy is as
        follows; the strategy for _contextend_ (right hand side) is
        analogous (using context_right, matchend, and
        context_break_spanend):

        - context_break_span is None and context_left is None:
          context = match
        - context_break_span is None and context_left is not None:
          context = max(0, match - context_left)
        - context_break_span is not None and context_left is None:
          context = context_break_span
        - context_break_span is not None and context_left is not None:
          context = max(match - context_left, context_break_span)

        :param DataFrame df_dump: DataFrame indexed by (match, matchend)
        :param int context_left: maximum context to the left of match
        :param int context_right: maximum context to the right of matchend
        :param str context_break: s-attribute to confine context

        :return: df_dump
        :rtype: DataFrame

        """

        if context_break is None:
            df = df_dump.reset_index()
            # left
            if context_left is None:
                df['context'] = df.match
            else:
                df['context'] = maximum(0, df.match - context_left)
            # right
            if context_right is None:
                df['contextend'] = df.matchend
            else:
                df['contextend'] = minimum(self.corpus_size - 1,
                                           df.matchend + context_right)
        else:
            # get context confined by s-attribute
            df = self.dump2satt(df_dump, context_break,
                                annotation=False).reset_index()

            # save contextid
            df['contextid'] = df[context_break + '_cwbid']

            # replace -1 to not confuse min()
            spanend = context_break + '_spanend'
            df[spanend] = df[spanend].replace(-1, self.corpus_size + 1)

            # left
            if context_left is None:
                df['context'] = df[context_break + '_span']
            else:
                df['context'] = df.match - context_left
                df['context'] = df[['context',
                                    context_break + '_span']].max(axis=1)
            # right
            if context_right is None:
                df['contextend'] = df[context_break + '_spanend']
            else:
                df['contextend'] = df.matchend + context_right
                df['contextend'] = df[[
                    'contextend', context_break + '_spanend'
                ]].min(axis=1)

            # revert replacement
            df[spanend] = df[spanend].replace(self.corpus_size + 1, -1)

        # restore original index
        df = df.set_index(['match', 'matchend'])

        return df
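
    # Worked example of the left boundary (numbers are illustrative):
    # with match=100, context_left=20 and a confining <s> span starting
    # at 95, context = max(100 - 20, 95) = 95 (clipped at the span start);
    # without context_break, context = max(0, 100 - 20) = 80.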

    #################################################
    # QUERY ALIASES #################################
    #################################################
    def query_s_att(self, s_att, values=set(), name=None):
        """Get s-attribute spans as Dump, optionally restricting the spans by
        matching the provided values against the s-att annotations.

        === (match, matchend), $s_cwbid, $s* ===

        :param str s_att: s-attribute to use for spans
        :param set values: values of s-att annotation to restrict spans to
        :param str name: if given, save the spans as NQR under this name

        :return: dump
        :rtype: Dump

        """

        df_spans = self.dump_from_s_att(s_att)

        # restrict to certain values
        if len(values) > 0:
            if s_att not in df_spans.columns:
                logger.error("cannot restrict spans without annotation")
                df_spans = DataFrame(columns=['match', 'matchend']).set_index(
                    ['match', 'matchend'])
            else:
                values = set(values)
                logger.info("restricting spans using %d values" % len(values))
                df_spans = df_spans.loc[df_spans[s_att].isin(values)]

        # save as NQR
        if name is not None:
            # undump the dump and save to disk
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_spans, name)
            cqp.nqr_save(self.corpus_name, name)
            cqp.__kill__()

        # return proper Dump
        return Dump(self.copy(), df_spans, name_cqp=None)
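
    # Usage sketch (attribute and values are assumptions):
    #   dump = corpus.query_s_att('text_genre', values={'news', 'blog'})
    #   # Dump of all <text_genre> spans whose annotation is in the set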

    def query(self,
              cqp_query,
              context=20,
              context_left=None,
              context_right=None,
              context_break=None,
              corrections=dict(),
              match_strategy='standard',
              name=None):
        """Get query result as (context-extended) Dump (with corrected
        anchors). If a name is given, the resulting NQR (without
        context and before anchor correction) will be written to disk
        in CWB binary format.

        === (match, matchend), 0*, ..., 9*, contextid*, context*, contextend* ===

        :param str cqp_query: CQP query
        :param int context: maximum context around match..matchend (symmetric)
        :param int context_left: maximum context left to the match
        :param int context_right: maximum context right to the matchend
        :param str context_break: s-attribute to confine context to
        :param dict corrections: corrections {anchor: offset}
        :param str match_strategy: CQP matching strategy
        :param str name: name for NQR

        :return: dump
        :rtype: Dump

        """

        # preprocess input
        save = name is not None  # save NQR from CQP to disk?
        name = 'Last' if name is None else name  # name in CQP
        query, s_query, anchors = preprocess_query(cqp_query)
        s_query = context_break if s_query is None else s_query
        context_left = context if context_left is None else context_left
        context_right = context if context_right is None else context_right

        # get dump from query
        df_dump = self.dump_from_query(query=query,
                                       s_query=s_query,
                                       anchors=anchors,
                                       match_strategy=match_strategy,
                                       name=name,
                                       save=save)

        # if dump has been retrieved from cache, NQR might not exist
        if self.show_nqr().empty or \
           name not in self.show_nqr()['subcorpus'].values:
            # undump the dump and save to disk
            cqp = self.start_cqp()
            cqp.nqr_from_dump(df_dump, name)
            cqp.nqr_save(self.corpus_name, name)
            cqp.__kill__()

        # empty return?
        if len(df_dump) == 0:
            logger.warning("found 0 matches")
            df_dump = DataFrame(columns=['match', 'matchend']).set_index(
                ['match', 'matchend'])
        else:
            # extend dump to context
            df_dump = self.dump2context(df_dump, context_left, context_right,
                                        context_break)
            # apply corrections to anchor points
            df_dump = correct_anchors(df_dump, corrections)

        # return proper dump
        return Dump(self.copy(), df_dump, name_cqp=name)
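
    # Usage sketch (query is illustrative):
    #   dump = corpus.query('[lemma="walk"]', context=10, context_break='s')
    #   # ccc.Dump with (match, matchend) index and context columns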