Example #1
    def active_features(self, verbose=1):
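        """
        Return the contexts whose weight in the dense parameter vector is
        nonzero, applying the group-budget proximal step first so the active
        set is guaranteed to be the right size.
        """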

        # XXX: We probably don't want to do this here. However, we *do* run it
        # here so that the active set is guaranteed to be the right size.
        if self.group_budget is not None:
            self.dense.prox_budget(self.group_budget)

        active = [c for c in self.C if self.dense.w[self.context_feature_id(c)] != 0]

        #assert len(active) == np.sum(w != 0), 'active %s, nonzero %s' % (len(active), np.sum(w != 0))
        #self.check_L0_group_norm_proxy(self.dense)

        if verbose:
            print('%s: %s out of %s' % (colors.yellow % 'active', len(active), len(self.C)), end=' ')
            B = groupby2(active, len)
            print('(budget %s, sizes %s)' % (self.group_budget,
                                             ', '.join('%s: %s' % (z, len(B[z])) for z in sorted(B))))

        return active
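
groupby2 is not defined in these snippets, but from its usage throughout (called with either a positional or a key= argument, then indexed, sorted, and iterated like a dict) it presumably groups a sequence into a dict mapping each key to the list of elements with that key. A minimal sketch of such a helper, assuming that behavior:

def groupby2(xs, key):
    """Group elements of xs into a dict of lists, keyed by key(x)."""
    groups = {}
    for x in xs:
        groups.setdefault(key(x), []).append(x)
    return groups

# e.g. groupby2(['a', 'bb', 'cc', 'd'], len) == {1: ['a', 'd'], 2: ['bb', 'cc']}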
Example #2
    def add_features(self):
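        """
        Annotate items with layout features: position relative to the
        abstract, per-item local features, and ranks of each item's font size
        and width by frequency and of its font size by magnitude.
        """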

        items = self.pages[0].items

        # above/below abstract
        abstracts = [x for x in items if x.abstract]
        if len(abstracts) == 1:
            [abstract] = abstracts
            for x in items:
                x.attributes['above-abstract'] = x.yoffset < abstract.yoffset
        else:
            # TODO: handle no abstracts or many abstracts
            pass

        # extract local features
        for page in self.pages:
            for x in page.items:
                feature_extraction(x)

        # fontsize frequency
        fontsize = Counter(x.fontsize for x in items)
        freq = sorted(zip(fontsize.values(), fontsize.keys()), reverse=True)
        rank = {k: r + 1 for r, (v, k) in enumerate(freq)}
        for x in items:
            x.attributes['fontsize-freq-rank'] = rank[x.fontsize]

        # width frequency
        w = Counter(int(x.width) for x in items)
        freq = sorted(zip(w.values(), w.keys()), reverse=True)
        rank = {k: r + 1 for r, (v, k) in enumerate(freq)}
        for x in items:
            x.attributes['width-rank'] = rank[int(x.width)]

        # fontsize rank
        fontsize = groupby2(items, lambda x: x.fontsize)
        for rank, (_, vs) in enumerate(sorted(fontsize.items(), reverse=True)):
            for v in vs:
                v.attributes['fontsize-size-rank'] = rank + 1
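
The frequency-rank idiom above appears twice (font size, then width). Counter.most_common expresses it more directly and runs unchanged on Python 2 and 3; it is equivalent up to tie-breaking among equally frequent values. A standalone sketch with toy data:

from collections import Counter

sizes = [12, 12, 12, 10, 10, 9]   # toy font sizes
counts = Counter(sizes)
# most_common() yields (value, count) pairs sorted by descending count
rank = {k: r + 1 for r, (k, _) in enumerate(counts.most_common())}
assert rank == {12: 1, 10: 2, 9: 3}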
Example #3
def extract_title(filename, extra=True):
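    """
    Heuristically extract a paper's title from the first page of a PDF: keep
    items containing a capitalized word of three or more letters, then take
    the text set in the largest font. With extra=True, also dump first-page
    items grouped by font name as an aid to (experimental) author extraction.
    """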

    EXPERIMENTAL_AUTHOR_EXTRACTION = 1
    if EXPERIMENTAL_AUTHOR_EXTRACTION:
        A = authors_set()

    if not isinstance(filename, basestring):
        pdf = filename
        filename = pdf.filename
    else:
        filename = re.sub('^file://', '', filename)
        try:
            pdf = pdfminer(filename)
        except KeyboardInterrupt:
            raise
        except:
            return

    # check for skid-mark
#    if os.path.exists(filename + '.d/notes.org'):
#        from skid.add import Document
#        d = Document(filename)
#        meta = d.parse_notes()
#        print meta.get(u'title', None)
#        print meta.get(u'author', None)

    if not pdf:
        return

    page = pdf.pages[0].items

    # preprocessing
    page = [
        x for x in page
        # Need to find a word of three or more letters beginning with a
        # capital letter to be considered a candidate for author or title.
        if re.findall('[A-Z][A-Za-z][A-Za-z]+', x.text)
    ]

    # Capitalization filter: Titles (almost) always have at least one
    # capitalized three-letter word.
    #
    #  - TODO: discards multiline titles where the second line doesn't have any
    #    capitalized words.

    # TODO: Other observations to take advantage of: titles tend not to
    # contain single initials, unlike author names, and both the title and
    # the authors precede the word "abstract".

    g = groupby2(page, key=lambda x: x.fontsize)

    if not g:
        return

    title = ' '.join(x.text for x in g[max(g)])

    # Clean up case if all caps
    if title.isupper():
        title = title.title()

    print yellow % title.encode('utf8')

    if extra:

        # timv: this is sort of a proxy for author extraction. If it's easy to
        # copy-paste the authors maybe we don't need to have automatic
        # extraction.
        #
        #  - authors often appear in a distinguishing (infrequent) font.
        #
        #  - text of the document should be in the most frequent font
        #    (although sometimes the authors aren't in a distinguished font).
        #
        g = groupby2(page, key=lambda x: x.fontname)

        freq = [(len(v), k, v) for k, v in g.iteritems()]

        freq.sort()
        for count, key, items in freq:
            print
            print red % count, green % key
            for x in items[:15]:
                x = x.text.encode('utf8')

                if EXPERIMENTAL_AUTHOR_EXTRACTION:
                    # similarity to existing list of authors
                    aa = [(sim(a, simplify(x), n=3), a) for a in A]
                    aa = [(s, a) for s, a in aa if s > 0.2]
                    aa.sort(reverse=True)
                    print yellow % x, ('%s %s' %
                                       (red % '->', aa[:5])) if aa else ''
                else:
                    print yellow % x

        extract_year(freq)

    return title
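
sim, simplify, and authors_set are not defined in this snippet; from the call sim(a, simplify(x), n=3) and the 0.2 threshold, sim is presumably a character n-gram similarity and simplify some text normalization. A hypothetical sketch of both, under that assumption (the names and behavior are guesses, not the project's actual implementation):

import re

def simplify(s):
    # assumed normalization: lowercase and strip everything but letters/spaces
    return re.sub('[^a-z ]', '', s.lower())

def sim(a, b, n=3):
    """Jaccard similarity over character n-grams (an assumed implementation)."""
    grams = lambda s: {s[i:i + n] for i in range(len(s) - n + 1)}
    A, B = grams(a), grams(b)
    if not A or not B:
        return 0.0
    return len(A & B) / float(len(A | B))

assert sim('jane smith', simplify('Jane A. Smith')) > 0.2   # plausible match
assert sim('jane smith', simplify('Table of Contents')) < 0.2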
Example #4
def _main(args):
    with timeit('load data'):
        corpus = CoNLL_U('data/UD/{lang}/UD_{lang}'.format(lang=args.lang),
                         tag_type=args.tag_type)

    if args.quick:
        corpus.train = corpus.train[:100]
        corpus.dev = corpus.train[:0]

    allowed_contexts = None
    if args.context_count is not None:
        print 'context count filter threshold %s' % args.context_count

        max_order = args.initial_order + args.outer_iterations
        if args.max_order is not None:
            max_order = args.max_order

        allowed_contexts = contexts_by_count(corpus, max_order,
                                             args.context_count)
        print 'allowed_contexts:', len(allowed_contexts)

        B = groupby2(allowed_contexts, len)
        print '(sizes %s)' % (', '.join('%s: %s' % (z, len(B[z]))
                                        for z in sorted(B)))

        if 0:
            # things that survived the threshold.
            for k, v in B.items():
                if k >= 10:  # context size >= 10
                    print
                    print k
                    for vv in v:
                        print '-'.join(vv)
            pl.plot(B.keys(), map(len, B.values()))
            pl.show()

        if 0:
            max_order = args.outer_iterations
            C = {}
            for n in xrange(1, max_order + 1):  # initial order + num iters
                C.update(corpus.tag_ngram_counts(n=n))
            pl.scatter(map(len, C.keys()), C.values(), lw=0, alpha=0.5)
            pl.show()

    elif args.max_order is not None:
        allowed_contexts = prefix_closure(
            fixed_order_contexts(corpus.Y, order=args.max_order))
        print 'allowed_contexts:', len(allowed_contexts)

    A = ActiveSet(corpus,
                  Y=corpus.Y,
                  train=corpus.make_instances('train', Instance),
                  dev=corpus.make_instances('dev', Instance),
                  group_budget=args.budget,
                  regularizer=args.C,
                  outer_iterations=args.outer_iterations,
                  inner_iterations=args.inner_iterations,
                  initial_contexts=fixed_order_contexts(
                      corpus.Y, args.initial_order),
                  allowed_contexts=allowed_contexts,
                  no_failure_arcs=args.baseline,
                  dump=args.dump)

    A.active_set()
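
fixed_order_contexts and prefix_closure are project-specific and not shown here. From their use above (contexts are grouped by len and joined with '-'), a context is presumably a tuple of tag strings. A hypothetical sketch of both helpers, under that assumption:

from itertools import product

def fixed_order_contexts(Y, order):
    # assumed: all tag tuples of a fixed length over the tag set Y
    return {c for c in product(Y, repeat=order)}

def prefix_closure(contexts):
    # assumed: close the set under taking nonempty prefixes
    return {c[:i] for c in contexts for i in range(1, len(c) + 1)}

# e.g. prefix_closure(fixed_order_contexts({'N', 'V'}, 2)) ==
#      {('N',), ('V',), ('N', 'N'), ('N', 'V'), ('V', 'N'), ('V', 'V')}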
Example #5
def extract_title(filename, extra=True):

    if not isinstance(filename, basestring):
        pdf = filename
        filename = pdf.filename
    else:
        try:
            pdf = pdfminer(filename)
        except KeyboardInterrupt:
            raise
        except:
            return

    # check for skid-mark
#    if os.path.exists(filename + '.d/notes.org'):
#        from skid.add import Document
#        d = Document(filename)
#        meta = d.parse_notes()
#        print meta.get(u'title', None)
#        print meta.get(u'author', None)

    if not pdf:
        return

    page = pdf.pages[0].items

    # preprocessing
    page = [x for x in page
            # Need to find a word of three or more letters beginning with a
            # capital letter to be considered a candidate for author or title.
            if re.findall('[A-Z][A-Za-z][A-Za-z]+', x.text)]

    # TODO: titles tend not to contain single initials, unlike author names;
    # both the title and the authors precede the word "abstract".

    g = groupby2(page, key=lambda x: x.fontsize)

    if not g:
        return

    title = ' '.join(x.text for x in g[max(g)])

    # Clean up case if all caps
    if title.isupper():
        title = title.title()

    print yellow % title.encode('utf8')

    if extra:

        # timv: this is sort of a proxy for author extraction. If it's easy to
        # copy-paste the authors maybe we don't need to have automatic extraction.
        #
        # authors often appear in a distinguishing (infrequent) font
        g = groupby2(page, key=lambda x: x.fontname)

        freq = [(len(v), k, v) for k, v in g.iteritems()]

        freq.sort()

        for count, key, items in freq:
            print
            print red % count, green % key
            for x in items[:10]:
                print yellow % x.text.encode('utf8')

    return title
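
The core heuristic shared by both extract_title variants above is: filter first-page items down to those containing a capitalized word of three or more letters, then take the text set in the largest font. A standalone sketch of just that step (the Item type and the sample items are made up for illustration):

import re
from collections import namedtuple

Item = namedtuple('Item', 'text fontsize')   # hypothetical stand-in

def guess_title(items):
    """Largest-font text among items with a capitalized 3+ letter word."""
    candidates = [x for x in items
                  if re.findall('[A-Z][A-Za-z][A-Za-z]+', x.text)]
    if not candidates:
        return None
    top = max(x.fontsize for x in candidates)
    title = ' '.join(x.text for x in candidates if x.fontsize == top)
    # clean up case if the title is set in all caps
    return title.title() if title.isupper() else title

items = [Item('A Study of Things', 24), Item('Jane Smith', 12), Item('page 1', 8)]
assert guess_title(items) == 'A Study of Things'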