示例#1
0
 def get_Corpus(self, corpname, subcname=''):
     """
     Open a manatee corpus or one of its subcorpora.

     arguments:
     corpname -- a corpus identifier; the 'corpus:subcorpus' form is also
                 accepted and overrides the subcname argument
     subcname -- an optional subcorpus name

     returns:
     a manatee.Corpus or manatee.SubCorpus instance

     raises:
     RuntimeError if the requested subcorpus cannot be found
     """
     if ':' in corpname:
         corpname, subcname = corpname.split(':', 1)
     corp = manatee.Corpus(corpname)
     corp.corpname = str(corpname)  # never unicode (paths)
     corp.cm = self
     dsubcpath = self.default_subcpath(corp)
     if subcname:
         for sp in self.subcpath + [dsubcpath]:
             # the default subcorpus directory is flat; user dirs are per-corpus
             if sp == dsubcpath:
                 spath = os.path.join(sp, subcname + '.subc')
             else:
                 spath = os.path.join(sp, corpname, subcname + '.subc')
             if isinstance(spath, unicode):  # Python 2: paths must be byte strings
                 spath = spath.encode("utf-8")
             if os.path.isfile(spath):
                 subc = manatee.SubCorpus(corp, spath)
                 subc.corp = corp
                 subc.spath = spath
                 # touch a '<name>.used' marker file (best effort); close the
                 # handle instead of leaking it until GC
                 try:
                     with open(spath[:-4] + 'used', 'w'):
                         pass
                 except IOError:
                     pass
                 subc.corpname = str(corpname)  # never unicode (paths)
                 subc.subcname = subcname
                 subc.cm = self
                 # read in binary mode and close the handle deterministically
                 with open(spath, 'rb') as subc_file:
                     subc.subchash = md5(subc_file.read()).hexdigest()
                 subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
                 return subc
         raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
     else:
         return corp
示例#2
0
def _get_cached_conc(corp, subchash, q, pid_dir, minsize):
    """
    Loads a concordance from cache

    arguments:
    corp -- a manatee.Corpus instance the concordance was calculated on
    subchash -- a hash identifying a subcorpus (or None)
    q -- a chain of query operations (sequence of strings)
    pid_dir -- a directory where PID files of running calculations live
    minsize -- a size constraint forwarded to _wait_for_conc and
               _is_conc_alive (presumably a minimum concordance size
               to wait for -- see those helpers)

    returns:
    a 2-tuple (number of leading operations covered by the found cached
    concordance, concordance instance or None)
    """
    start_time = time.time()
    q = tuple(q)
    if not os.path.isdir(pid_dir):
        os.makedirs(pid_dir, mode=0o775)

    cache_map = plugins.get('conc_cache').get_mapping(corp)
    cache_map.refresh_map()
    # a shuffle operation in the chain presumably makes longer cached
    # prefixes unusable, so only the first operation is searched -- TODO confirm
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)

    ans = (0, None)
    # search the longest cached prefix of the operation chain first
    for i in range(srch_from, 0, -1):
        cachefile = cache_map.cache_file_path(subchash, q[:i])
        if cachefile:
            pidfile = cache_map.get_stored_pidfile(subchash, q[:i])
            _wait_for_conc(corp=corp,
                           q=q,
                           subchash=subchash,
                           cachefile=cachefile,
                           cache_map=cache_map,
                           pidfile=pidfile,
                           minsize=minsize)
            if not os.path.exists(cachefile):  # broken cache
                cache_map.del_entry(subchash, q)
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            conccorp = corp
            for qq in reversed(q[:i]):  # find the right main corp, if aligned
                if qq.startswith('x-'):
                    conccorp = manatee.Corpus(qq[2:])
                    break
            conc = PyConc(conccorp, 'l', cachefile, orig_corp=corp)
            if not _is_conc_alive(pidfile, minsize) and not conc.finished():
                # unfinished and dead concordance
                cache_map.del_entry(subchash, q)
                try:
                    os.remove(cachefile)
                except OSError:
                    pass
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        'get_cached_conc(%s, [%s]) -> %s, %01.4f' %
        (corp.corpname, ','.join(q), 'hit' if ans[1] else 'miss',
         time.time() - start_time))
    return ans
示例#3
0
    def __init__(self, directory, cache_size=int(1e7)):
        """
        Opens a phrase extractor located in a given directory.

        `cache_size` determines the maximum number of vocabulary items
        that can be held in memory.

        # Raises
            WorkspaceNotFound: If the directory contains no configuration
                file (i.e. it has not been prepared).
        """
        self.directory = directory
        self.cache_size = cache_size

        # Filenames of outputs of build_lexicon(...)
        self.fn_phrase_types = path.join(self.directory, "phrase_types.pickle")
        self.fn_lexicon = path.join(self.directory, "lexicon.vert")
        self.fn_discarded = path.join(self.directory, "discarded.vert")

        # Load configuration
        fn_config = path.join(self.directory, ManateeExtractor.FN_CONFIG)
        if not path.isfile(fn_config):
            # BUGFIX: the adjacent literals previously joined without a space
            # ("...does not exist.Did you forget...")
            raise WorkspaceNotFound(
                "Configuration file `%s` does not exist. "
                "Did you forget to `ManateeExtractor.prepare` the directory?"
                % fn_config
            )
        with our_open(fn_config) as f:
            self.config = json.load(f)

        # Initialize Manatee objects
        self.corpus = manatee.Corpus(self.config["corpus"])
        self.struct = self.corpus.get_struct(self.config["struct"])
        self.lex_attr = self.corpus.get_attr(self.config["lex_attr"])

        # an optional default attribute lets CQL queries omit [attr="..."]
        if self.config["cql_attr"] is not None:
            self.corpus.set_default_attr(self.config["cql_attr"])

        self.struct_size = self.struct.size()
示例#4
0
 def get_data_path(self, corp_id):
     """
     Return the corpus data directory (the PATH registry value without a
     trailing slash) or None when the corpus cannot be opened/read.
     """
     try:
         corpus = manatee.Corpus(os.path.join(self._reg_path, corp_id))
         data_path = corpus.get_conf('PATH')
         return data_path.rstrip('/')
     except Exception as ex:
         logging.getLogger(__name__).warning(ex)
         return None
示例#5
0
 def get_info(self, corpus_id: str) -> DefaultManateeCorpusInfo:
     """
     Return (and memoize) info about a corpus; a failure to open the
     corpus is answered with an info object built around an empty corpus.
     """
     try:
         if corpus_id not in self._cache:
             self._cache[corpus_id] = DefaultManateeCorpusInfo(manatee.Corpus(corpus_id), corpus_id)
         return self._cache[corpus_id]
     except Exception:
         # probably a misconfigured/missing corpus; narrowed from a bare
         # 'except:' so SystemExit/KeyboardInterrupt propagate
         return DefaultManateeCorpusInfo(EmptyCorpus(corpname=corpus_id), corpus_id)
示例#6
0
def analyze_corpus(corpname, corp_info):
    """
    Collect basic information about a corpus.

    returns:
    a 3-tuple (corpus size in positions, web info, keywords); the size
    falls back to zero when the corpus cannot be opened
    """
    size = 0
    try:
        corp = manatee.Corpus(os.path.join(REGISTRY_PATH, corpname))
        size = corp.size()
    except Exception:
        # the corpus may be missing/misconfigured -> report zero size;
        # narrowed from a bare 'except:' so SystemExit et al. propagate
        pass
    return (size, get_web(corp_info), get_keywords(corp_info))
示例#7
0
    def make(directory, corpus, struct, lex_attr, cql_attr=None):
        """
        Prepares a workspace for phrase extraction in a given directory.

        # Arguments
            directory: All intermediate data generated during
                phrase extraction will be saved to this directory.
            corpus: Name of a compiled corpus.
            struct: Name of a structure (typically sentence).
            lex_attr: Lexical attribute (typically lemma).
            cql_attr: Default CQL attribute.
                It can be used to shorten queries,
                e.g. you can use "N.*" instead of [tag="N.*"]
                if the default attribute is set to "tag".

        # Raises
            WorkspaceReserved: If there already exists
                a configuration file in the given directory.
        """
        fn_config = path.join(directory, ManateeExtractor.FN_CONFIG)
        if path.exists(fn_config):
            raise WorkspaceReserved(
                "Configuration file `%s` already exists."
                % fn_config
            )

        # Fail early if any of the provided names cannot be
        # resolved by manatee.
        probe = manatee.Corpus(corpus)
        probe.get_struct(struct)
        probe.get_attr(lex_attr)
        if cql_attr is not None:
            probe.get_attr(cql_attr)

        # Create the base directory plus the subdirectories
        # required by the extraction pipeline.
        wanted_dirs = [directory,
                       path.join(directory, ManateeExtractor.DN_RAW_POCS),
                       path.join(directory, ManateeExtractor.DN_RAW_VOCABS)]
        for wanted in wanted_dirs:
            if not path.exists(wanted):
                makedirs(wanted)

        # Persist the basic workspace configuration.
        config = dict(
            corpus=corpus,
            struct=struct,
            lex_attr=lex_attr,
            cql_attr=cql_attr,
        )
        with our_open(fn_config, "w") as config_file:
            json.dump(config, config_file)
示例#8
0
 def get_info(self, canonical_corpus_id):
     """
     Return (and memoize) a ManateeCorpusInfo for a corpus; any failure
     is answered with an info object built around an empty corpus.
     """
     try:
         if canonical_corpus_id not in self._cache:
             self._cache[canonical_corpus_id] = ManateeCorpusInfo(
                 manatee.Corpus(canonical_corpus_id), canonical_corpus_id)
         return self._cache[canonical_corpus_id]
     except Exception:
         # probably a misconfigured/missing corpus; narrowed from a bare
         # 'except:' so SystemExit/KeyboardInterrupt propagate
         return ManateeCorpusInfo(EmptyCorpus(corpname=canonical_corpus_id),
                                  canonical_corpus_id)
示例#9
0
 def get_info(self, corpus_id):
     """
     Return corpus info, memoizing successful lookups; a failed lookup
     is logged and answered with an empty-corpus placeholder.
     """
     try:
         if corpus_id in self._cache:
             return self._cache[corpus_id]
         info = DefaultManateeCorpusInfo(
             manatee.Corpus(corpus_id), corpus_id)
         self._cache[corpus_id] = info
         return info
     except Exception as ex:
         logging.getLogger(__name__).warning(ex)
         # probably a misconfigured/missing corpus
         return DefaultManateeCorpusInfo(EmptyCorpus(corpname=corpus_id),
                                         corpus_id)
示例#10
0
def _load_corp(corp_id, subc_path):
    """
    Build a manatee corpus object.

    arguments:
    corp_id -- a corpus identifier
    subc_path -- path to a subcorpus; if empty/None the full corpus is used

    returns:
    a manatee.Corpus, or a manatee.SubCorpus when subc_path is provided
    """
    if subc_path:
        corp = manatee.SubCorpus(manatee.Corpus(corp_id), subc_path)
    else:
        corp = manatee.Corpus(corp_id)
    corp.corpname = corp_id
    return corp
示例#11
0
    def get_Corpus(self,
                   corpname: str,
                   corp_variant: str = '',
                   subcname: str = '',
                   decode_desc: bool = True) -> Corpus:
        """
        Open (and cache) a corpus or one of its subcorpora.

        args:
            corpname: a corpus identifier; the 'corpus:subcorpus' form is
                      also accepted and overrides the subcname argument
            corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
                          please note that in many cases this can be omitted as only in case user
                          wants to see a continuous text (e.g. kwic context) we must make sure he
                          sees only a 'legal' chunk.
            subcname: an optional subcorpus name
            decode_desc: passed on to _open_subcorpus (presumably controls
                         decoding of a subcorpus description -- see that method)

        raises:
            RuntimeError if the requested subcorpus cannot be found
        """
        if ':' in corpname:
            corpname, subcname = corpname.split(':', 1)

        public_subcname = self.get_subc_public_name(corpname, subcname)
        cache_key = (corpname, corp_variant, subcname, public_subcname)
        if cache_key in self._cache:
            return self._cache[cache_key]
        registry_file = os.path.join(corp_variant,
                                     corpname) if corp_variant else corpname
        self._ensure_reg_file(registry_file, corp_variant)
        corp = manatee.Corpus(registry_file)
        corp.corpname = str(corpname)  # never unicode (paths)
        corp.is_published = False
        corp.author = None
        corp.author_id = None

        # NOTE: line corp.cm = self (as present in NoSke and older KonText versions) has
        # been causing file descriptor leaking for some operations (e.g. corp.get_attr).
        # KonText does not need such an attribute but to keep developers informed I leave
        # the comment here.
        if subcname:
            # a published subcorpus name takes precedence over the private one
            if public_subcname:
                subcname = public_subcname
            for sp in self.subcpath:
                spath = os.path.join(sp, corpname, subcname + '.subc')
                if os.path.isfile(spath):
                    subc = self._open_subcorpus(corpname, subcname, corp,
                                                spath, decode_desc)
                    self._cache[cache_key] = subc
                    return subc
            raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
        else:
            self._cache[cache_key] = corp
        return corp
示例#12
0
def get_existing_conc(corp: manatee.Corpus,
                      q: Tuple[str, ...]) -> manatee.Concordance:
    """
    Restore an already calculated concordance from the cache.

    raises:
    ConcNotFoundException if no cache record exists for the query
    BrokenConcordanceException if a record exists but is not usable
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(getattr(corp, 'subchash', None), q)
    if status is None:
        raise ConcNotFoundException('Concordance not found.')
    if not (status.finished and status.readable):
        raise BrokenConcordanceException(
            'Concordance broken. File: {}, error: {}'.format(
                status.cachefile, status.error))
    mcorp = corp
    for op in reversed(q):  # find the right main corp, if aligned
        if op.startswith('x-'):
            mcorp = manatee.Corpus(op[2:])
            break
    return PyConc(mcorp, 'l', status.cachefile, orig_corp=corp)
示例#13
0
    def get_Corpus(self,
                   corpname,
                   corp_variant='',
                   subcname='',
                   decode_desc=True):
        """
        Open (and cache) a corpus or one of its subcorpora.

        args:
            corpname: a corpus identifier; the 'corpus:subcorpus' form is
                      also accepted and overrides the subcname argument
            corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
                          please note that in many cases this can be omitted as only in case user
                          wants to see a continuous text (e.g. kwic context) we must make sure he
                          sees only a 'legal' chunk.
            subcname: an optional subcorpus name
            decode_desc: passed on to _open_subcorpus (presumably controls
                         decoding of a subcorpus description -- see that method)

        raises:
            RuntimeError if the requested subcorpus cannot be found
        """
        if ':' in corpname:
            corpname, subcname = corpname.split(':', 1)

        public_subcname = self.get_subc_public_name(corpname, subcname)
        cache_key = (corpname, corp_variant, subcname, public_subcname)
        if cache_key in self._cache:
            return self._cache[cache_key]
        registry_file = os.path.join(corp_variant,
                                     corpname) if corp_variant else corpname
        self._ensure_reg_file(registry_file, corp_variant)
        corp = manatee.Corpus(registry_file)
        corp.corpname = str(corpname)  # never unicode (paths)
        corp.is_published = False
        corp.cm = self

        if subcname:
            # a published subcorpus name takes precedence over the private one
            if public_subcname:
                subcname = public_subcname
            for sp in self.subcpath:
                spath = os.path.join(sp, corpname, subcname + '.subc')
                if type(spath) == unicode:  # Python 2: paths must be byte strings
                    spath = spath.encode('utf-8')
                if os.path.isfile(spath):
                    subc = self._open_subcorpus(corpname, subcname, corp,
                                                spath, decode_desc)
                    self._cache[cache_key] = subc
                    return subc
            raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
        else:
            self._cache[cache_key] = corp
        return corp
示例#14
0
 def get_Corpus(self, corpname, corp_variant='', subcname=''):
     """
     Open a manatee corpus or one of its subcorpora.

     args:
         corpname: a corpus identifier; the 'corpus:subcorpus' form is also
                   accepted and overrides the subcname argument
         corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
                       please note that in many cases this can be omitted as only in case user
                       wants to see a continuous text (e.g. kwic context) we must make sure he
                       sees only a 'legal' chunk.
         subcname: an optional subcorpus name

     raises:
         RuntimeError if the requested subcorpus cannot be found
     """
     if ':' in corpname:
         corpname, subcname = corpname.split(':', 1)
     registry_file = os.path.join(corp_variant,
                                  corpname) if corp_variant else corpname
     corp = manatee.Corpus(registry_file)
     corp.corpname = str(corpname)  # never unicode (paths)
     corp.cm = self
     dsubcpath = self.default_subcpath(corp)
     if subcname:
         for sp in self.subcpath + [dsubcpath]:
             # the default subcorpus directory is flat; user dirs are per-corpus
             if sp == dsubcpath:
                 spath = os.path.join(sp, subcname + '.subc')
             else:
                 spath = os.path.join(sp, corpname, subcname + '.subc')
             if type(spath) == unicode:  # Python 2: paths must be byte strings
                 spath = spath.encode("utf-8")
             if os.path.isfile(spath):
                 subc = manatee.SubCorpus(corp, spath)
                 subc.corp = corp
                 subc.spath = spath
                 # touch a '<name>.used' marker file (best effort)
                 # NOTE(review): the handle is never closed explicitly
                 try:
                     open(spath[:-4] + 'used', 'w')
                 except IOError:
                     pass
                 subc.corpname = str(corpname)  # never unicode (paths)
                 subc.subcname = subcname
                 subc.cm = self
                 # NOTE(review): this open() also relies on GC to close the file
                 subc.subchash = md5(open(spath).read()).hexdigest()
                 subc.created = datetime.fromtimestamp(
                     int(os.path.getctime(spath)))
                 return subc
         raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
     else:
         return corp
示例#15
0
    def get_corpus(self,
                   corpname: str,
                   corp_variant: str = '',
                   subcname: str = '',
                   decode_desc: bool = True) -> AbstractKCorpus:
        """
        Open (and cache) a corpus or subcorpus wrapped as KCorpus/KSubcorpus.

        args:
            corpname: a corpus identifier
            corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
                          please note that in many cases this can be omitted as only in case user
                          wants to see a continuous text (e.g. kwic context) we must make sure he
                          sees only a 'legal' chunk.
            subcname: an optional subcorpus name
            decode_desc: passed to KSubcorpus.load (presumably controls decoding
                         of the subcorpus description -- see that method)

        raises:
            RuntimeError if the requested subcorpus cannot be found
        """
        public_subcname = self.get_subc_public_name(corpname, subcname)
        registry_file = self._ensure_reg_file(corpname, corp_variant)
        cache_key = (registry_file, subcname, public_subcname)
        if cache_key in self._cache:
            return self._cache[cache_key]
        corp = manatee.Corpus(registry_file)

        # NOTE: line corp.cm = self (as present in NoSke and older KonText versions) has
        # been causing file descriptor leaking for some operations (e.g. corp.get_attr).
        # KonText does not need such an attribute but to keep developers informed I leave
        # the comment here.
        if subcname:
            # a published subcorpus name takes precedence over the private one
            if public_subcname:
                subcname = public_subcname
            for sp in self.subcpath:
                spath = os.path.join(sp, corpname, subcname + '.subc')
                if os.path.isfile(spath):
                    subc = KSubcorpus.load(corp, corpname, subcname, spath,
                                           decode_desc)
                    self._cache[cache_key] = subc
                    return subc
            raise RuntimeError(_('Subcorpus "{}" not found').format(
                subcname))  # TODO error type
        else:
            kcorp = KCorpus(corp, corpname)
            self._cache[cache_key] = kcorp
        return kcorp
示例#16
0
def fcs_scan(corpname, scan_query, max_ter, start):
    """
    aux function for federated content search: operation=scan

    arguments:
    corpname -- a corpus identifier
    scan_query -- a query in the form 'attr = "value"' ('=' may also be
                  spelled as the word 'exact'); '+' chars are treated as
                  URL-encoded spaces
    max_ter -- maximum number of terms to return
    start -- index of the first returned term

    returns:
    a list of (term, frequency) pairs

    raises:
    Exception carrying an FCS diagnostic code tuple on invalid input
    """
    if not scan_query:
        raise Exception(7, 'scan_query', 'Mandatory parameter not supplied')
    query = scan_query.replace('+', ' ')  # convert URL spaces
    exact_match = False
    if 'exact' in query.lower() and '=' not in query:  # lemma ExacT "dog"
        pos = query.lower().index('exact')  # first occurence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
        exact_match = True
    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    try:
        if '=' in query:
            attr, value = query.split('=')
            attr = attr.strip()
            value = value.strip()
        else:  # must be in format attr = value
            raise Exception
        if '"' in attr:
            raise Exception
        if '"' in value:
            # a quoted value must be fully quoted on both ends
            if value[0] == '"' and value[-1] == '"':
                value = value[1:-1].strip()
            else:
                raise Exception
    except Exception:
        raise Exception(10, scan_query, 'Query syntax error')
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    import corplib
    # anchor the pattern for exact matches, otherwise search as substring
    if exact_match:
        wlpattern = '^' + value + '$'
    else:
        wlpattern = '.*' + value + '.*'
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort='f')
    return [(d['str'], d['freq']) for d in wl][start:][:max_ter]
示例#17
0
def _get_cached_conc(corp, subchash, q, minsize):
    """
    Loads a concordance from cache. The function
    tries to find at least a sublist of 'q' (starting
    from zero) to avoid full concordance search if
    possible.

    arguments:
    corp -- a respective manatee.Corpus object
    subchash -- a subcorpus hash (generated by PyConc)
    q -- a query representation list
    minsize -- a minimum concordance size to return immediately (synchronously)

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results], [a concordance instance]
    """
    start_time = time.time()
    q = tuple(q)

    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    # a shuffle operation presumably makes longer cached prefixes unusable,
    # so only the first operation is considered -- TODO confirm
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)

    ans = (0, None)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cachefile = cache_map.cache_file_path(subchash, q[:i])
        if cachefile:
            try:
                _wait_for_conc(cache_map=cache_map,
                               subchash=subchash,
                               q=q[:i],
                               minsize=minsize)
            except ConcCalculationControlException as ex:
                # a stuck/failed calculation -> drop its cache record and
                # try a shorter operation prefix
                _cancel_async_task(cache_map, subchash, q[:i])
                logging.getLogger(__name__).warning(
                    'Removed broken concordance cache record. Original error: %s'
                    % (ex, ))
                continue
            conccorp = corp
            for qq in reversed(q[:i]):  # find the right main corp, if aligned
                if qq.startswith('x-'):
                    conccorp = manatee.Corpus(qq[2:])
                    break
            conc = None
            try:
                if not _min_conc_unfinished(cache_map=cache_map,
                                            subchash=subchash,
                                            q=q[:i],
                                            minsize=minsize):
                    conc = PyConc(conccorp, 'l', cachefile, orig_corp=corp)
            except (ConcCalculationControlException,
                    manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    'Failed to join unfinished calculation: {0}'.format(ex))
                _cancel_async_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        'get_cached_conc(%s, [%s]) -> %s, %01.4f' %
        (corp.corpname, ','.join(q), 'hit' if ans[1] else 'miss',
         time.time() - start_time))
    return ans
示例#18
0
 def _open_corpus(self, corpname):
     if corpname not in self._corp_cache:
         self._corp_cache[corpname] = manatee.Corpus(corpname)
     return self._corp_cache[corpname]
示例#19
0
def _load_corp(corp_id, subc_path):
    """
    Instantiate a manatee.Corpus (or manatee.SubCorpus) instance.

    arguments:
    corp_id -- a corpus identifier
    subc_path -- path to a subcorpus data file; if empty/None the whole
                 corpus is used
    """
    corp = manatee.Corpus(corp_id)
    if subc_path:
        corp = manatee.SubCorpus(corp, subc_path)
    corp.corpname = corp_id
    return corp
示例#20
0
 def get_corpus_size(self, corp_id):
     """
     Return the number of positions in the corpus identified by corp_id.
     """
     registry_path = os.path.join(self._reg_path, corp_id)
     return manatee.Corpus(registry_path).size()
示例#21
0
 def corp_factory(reg_path):
     """
     Create a manatee.Corpus instance from a registry file path.
     """
     return manatee.Corpus(reg_path)
示例#22
0
 def get_corpus_name(self, corp_id):
     """
     Return the decoded NAME registry value of a corpus, or None when
     the corpus cannot be opened/read.
     """
     try:
         c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
         return c.get_conf('NAME').decode(self.get_corpus_encoding(corp_id))
     except Exception:
         # narrowed from a bare 'except:' so SystemExit et al. propagate
         return None
示例#23
0
 def get_corpus_description(self, corp_id):
     """
     Return the decoded INFO registry value of a corpus, or None when
     the corpus cannot be opened/read.
     """
     try:
         c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
         return c.get_conf('INFO').decode(self.get_corpus_encoding(corp_id))
     except Exception:
         # narrowed from a bare 'except:' so SystemExit et al. propagate
         return None
示例#24
0
 def get_corpus_encoding(self, corp_id):
     """
     Return the ENCODING registry value of a corpus, or None when the
     corpus cannot be opened/read.
     """
     try:
         c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
         return c.get_conf('ENCODING')
     except Exception:
         # narrowed from a bare 'except:' so SystemExit et al. propagate
         return None
示例#25
0
def open_corpus(*args, **kwargs):
    """
    Creates a manatee.Corpus instance

    All positional and keyword arguments are forwarded unchanged to the
    manatee.Corpus constructor (typically a single registry file path).
    """
    return manatee.Corpus(*args, **kwargs)
示例#26
0
def find_cached_conc_base(
        corp: manatee.Corpus, subchash: Optional[str], q: Tuple[str, ...],
        minsize: int) -> Tuple[Optional[int], manatee.Concordance]:
    """
    Load a concordance from cache starting from a complete operation q[:],
    then trying q[:-1], q[:-2], q:[:-i] etc. A possible found concordance can be
    used to skip calculation of already available operations q[:-i].

    arguments:
    minsize -- a minimum concordance size to return immediately (synchronously); please
                note that unlike wait_for_conc here we accept also 0

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results], [a concordance instance]
    """
    start_time = time.time()
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    calc_status = cache_map.get_calc_status(subchash, q)
    if calc_status:
        if calc_status.error is None:
            # drop cache records that are older than the corpus data itself
            corp_mtime = corplib_corp_mtime(corp)
            if calc_status.created - corp_mtime < 0:
                logging.getLogger(__name__).warning(
                    'Removed outdated cache file (older than corpus indices)')
                cache_map.del_full_entry(subchash, q)
        else:
            logging.getLogger(__name__).warning(
                'Removed failed calculation cache record (error: {0}'.format(
                    calc_status.error))
            cache_map.del_full_entry(subchash, q)

    # a shuffle operation presumably makes longer cached prefixes unusable,
    # so only the first operation is considered -- TODO confirm
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)

    conc = EmptyConc(corp=corp)
    ans = (0, conc)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cache_path = cache_map.cache_file_path(subchash, q[:i])
        # now we know that someone already calculated the conc (but it might not be finished yet)
        if cache_path:
            try:
                ready = wait_for_conc(cache_map=cache_map,
                                      subchash=subchash,
                                      q=q[:i],
                                      minsize=minsize)
                if not ready:
                    if minsize != 0:
                        cancel_async_task(cache_map, subchash, q[:i])
                        logging.getLogger(__name__).warning(
                            'Removed unfinished concordance cache record due to exceeded time limit'
                        )
                    continue
                _, finished = _check_result(cache_map=cache_map,
                                            subchash=subchash,
                                            q=q[:i],
                                            minsize=minsize)
                if finished:
                    mcorp = corp
                    for qq in reversed(
                            q[:i]):  # find the right main corp, if aligned
                        if qq.startswith('x-'):
                            mcorp = manatee.Corpus(qq[2:])
                            break
                    conc = PyConc(mcorp, 'l', cache_path, orig_corp=corp)
            except (ConcCalculationStatusException,
                    manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    f'Failed to use cached concordance for {q[:i]}: {ex}')
                cancel_async_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    # NOTE(review): 'i' below is the loop variable -- it is unbound when 'q'
    # is empty (srch_from == 0) and on a complete cache miss it holds 1 (the
    # last tried index) rather than ans[0]; confirm whether this is intended.
    logging.getLogger(__name__).debug(
        f'get_cached_conc({corp.get_conffile()}, [{", ".join(q)}]), '
        f'conc: {conc.__class__.__name__}, '
        f'missing ops start idx: {i if i < len(q) else "none"}, '
        f'time: {(time.time() - start_time):.4f}')
    return ans
示例#27
0
def get_corpus_size(corpus_id, reg_dir):
    """
    Return the size (number of positions) of a corpus located via the
    given registry directory; falls back to 0 for a falsy corpus object.
    """
    corp = manatee.Corpus(os.path.join(reg_dir, corpus_id))
    if not corp:
        return 0
    return corp.size()
示例#28
0
 def get_data_path(self, corp_id):
     """
     Return the corpus data directory (the PATH registry value without a
     trailing slash) or None when the corpus cannot be opened/read.
     """
     try:
         c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
         return c.get_conf('PATH').rstrip('/')
     except Exception:
         # narrowed from a bare 'except:' so SystemExit et al. propagate
         return None
示例#29
0
文件: SeaCOW.py 项目: rsling/seacow
  def run(self):
    """
    Executes the prepared query on the corpus and streams the results
    through the configured processor (if any).

    Raises QueryError when a mandatory property (corpus, attributes,
    structures, references, container, string) is missing or when the
    processor does not inherit from SeaCOW.Processor.
    """

    # Check whether query is prepared.
    if self.corpus is None:
      raise QueryError('You must specify the corpus to do a search.')
    if self.attributes is None:
      raise QueryError('You must specify at least one attribute to do a search.')
    if self.structures is None:
      raise QueryError('You must specify at least one structure to do a search.')
    if self.references is None:
      raise QueryError('You must specify at least one reference to do a search.')
    if self.container is None and not issubclass(type(self.processor), Nonprocessor):
      raise QueryError('You must specify the container to do a search.')
    # BUGFIX: 'self.string is '' ' compared object identity with a literal;
    # equality is what is meant here.
    if self.string is None or self.string == '':
      raise QueryError('You must set the string property to a search string.')

    # Check whether processor of proper type
    if self.processor and not issubclass(type(self.processor), Processor):
      raise QueryError('The processor class must inherit from SeaCOW.Processor.')

    # Emit heuristic warning that container might end up being to small.
    # This warns about the behviour reported 2020 by EP.
    q_pattern = r'.* within *<' + self.container + r'(| [^>]+)/>.*'
    q_string = r'within <' + self.container + r'/>'
    if not re.match(q_pattern, self.string):
      print("WARNING! Your query should probably end in '" + q_string + "' or your match might exceed the exported container.")
      if self.context_left == 0 or self.context_right == 0:
        print(" ... especially because at least one of your contexts is 0!")
      print(" ... Watch out for 'Index anomaly' warnings.")
      # BUGFIX: a bare 'print' is a no-op expression on Python 3; call it.
      print()


    # Allow the processor to engage in preparatory action/check whether everything is fine.
    if self.processor:
      self.processor.prepare(self)

    # Set up and run query.
    h_corpus      = manatee.Corpus(self.corpus)
    if self.subcorpus is not None:
        # If subcorpus name is given (instead of path), figure out full path to subcorpus .subc file.
        if not "/" in self.subcorpus:
            # BUGFIX: raw string avoids the invalid '\.' escape warning
            self.subcorpus = h_corpus.get_conf("PATH") + "subcorp/" + re.sub(r"\.subc$", "", self.subcorpus.strip(" /")) + ".subc"
        if os.path.exists(self.subcorpus):
            h_corpus = manatee.SubCorpus (h_corpus, self.subcorpus)
        else:
            raise QueryError('The requested subcorpus cannot be found.')

    if not issubclass(type(self.processor), Nonprocessor):
      h_region      = manatee.CorpRegion(h_corpus, ','.join(self.attributes), ','.join(self.structures))
      h_cont        = h_corpus.get_struct(self.container)
      h_refs        = [h_corpus.get_attr(r) for r in self.references]

    start_time    = time.time()
    results       = h_corpus.eval_query(self.string)

    # Process results.
    counter  = 0
    dup_no   = 0

    # In case class is "Noprocessor", we do not process the stream.
    if issubclass(type(self.processor), Nonprocessor):

      # Store the hit count as reported.
      self.hits = results.count_rest()
    else:
      while not results.end() and (self.max_hits < 0 or counter < self.max_hits):

        # Skip randomly if random subset desired.
        if self.random_subset > 0 and random.random() > self.random_subset:
          results.next()
          continue

        kwic_beg = results.peek_beg()                                  # Match begin.
        kwic_end = results.peek_end()                                  # Match end.
        cont_beg_num = h_cont.num_at_pos(kwic_beg)-self.context_left   # Container at match begin.
        cont_end_num = h_cont.num_at_pos(kwic_beg)+self.context_right  # Container at match end.

        # If hit not in desired region, drop.
        if cont_beg_num < 0 or cont_end_num < 0:
          results.next()
          continue

        cont_beg_pos = h_cont.beg(cont_beg_num)                   # Pos at container begin.
        cont_end_pos = h_cont.end(cont_end_num)                   # Pos at container end.

        refs = [h_refs[i].pos2str(kwic_beg) for i in range(0, len(h_refs))]
        region = h_region.region(cont_beg_pos, cont_end_pos, '\t', '\t')

        # Deduping.
        if type(self.bloom) is pybloom_live.ScalableBloomFilter:
          dd_region = ''.join([region[i].strip().lower() for i in range(0, len(region), 1+len(self.attributes))])
          if {dd_region : 0} in self.bloom:
            dup_no += 1
            results.next()
            continue
          else:
            self.bloom.add({dd_region : 0})

        # Call the processor.
        if self.processor:
          self.processor.process(self, region, refs, kwic_beg - cont_beg_pos, kwic_end - kwic_beg)

        # Advance stream/loop.
        results.next()
        counter = counter + 1

      # After loop but inside "if not Nonprocessor", set hit count.
      self.hits          = counter

    self.querytime     = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    self.duplicates    = dup_no
    self.elapsed       = time.time()-start_time

    # Allow the processor to finalise its job.
    if self.processor:
      self.processor.finalise(self)
示例#30
0
def main():
    """
    Export a labelled dataset (data.csv + labels.csv) built from a
    manatee corpus combined with annotations stored in an SQLite db.
    """
    fmt = '[%(asctime)-15s] %(levelname)s: %(message)s'
    logging.basicConfig(level=logging.INFO, format=fmt)

    m = argparse.ArgumentDefaultsHelpFormatter
    p = argparse.ArgumentParser(description="", formatter_class=m)
    p.add_argument("-c", "--corpus", type=str, required=True)
    p.add_argument("-d", "--db", type=str, required=True)
    p.add_argument("-o", "--outfile", type=str, required=True)

    args = p.parse_args()

    log.info("opening database {}".format(args.db))
    db = sqlite3.connect(args.db)
    db.isolation_level = None  # I want to handle transactions myself
    log.info("opening corpus {}".format(args.corpus))
    corp = manatee.Corpus(args.corpus)
    log.info("corpus has %d positions" % corp.size())

    log.info("reading annotations")
    attrs = read_annots(corp, db)

    headers_simple = []
    headers_multi = []

    # BUGFIX: dict.iteritems() does not exist on Python 3; items() works on
    # both Python 2 and 3.
    for k, v in default_annot_values.items():
        if k not in simple_attributes:
            continue
        if len(v) > 2:
            # attributes with many values get one column per value
            for vv in v:
                headers_multi.append((k, vv, (k + "_" + vv).replace(' ', '-')))
        else:
            headers_simple.append(k)

    log.info("reading corpus text")
    # (removed unused locals: doc / docsize / fn were computed but never read)

    with open('labels.csv', 'w') as lf:
        for x in headers_simple:
            print(x, file=lf)
        for x, y, z in headers_multi:
            # NOTE(review): on Python 3 this writes the bytes repr (b'...');
            # confirm whether plain 'z' is intended here.
            print(z.encode('utf-8'), file=lf)

    print("Grouping most common answer")
    most_common = attrs.groupby(["docid",
                                 "name"]).agg(lambda x: pd.Series.mode(x)[0])

    print("Converting dataset")
    df = most_common.unstack()
    df = df[~df.isna().any(axis=1)]
    df.columns = df.columns.droplevel()
    df.columns.name = None
    df.index.name = None

    print("Getting text")
    df["text"] = df.index.map(lambda docid: text(corp, docid, 'word'))

    print("Writing dataset")
    df.to_csv("data.csv", index=False)