def _get_form2lemma(a_fname):
    """Read file containing form/lemma correspodences

    @param a_fname - name of input file

    @return void (correspondences are read into global variables)

    """
    global STOP_WORDS, FORM2LEMMA

    if not os.path.isfile(a_fname) or not os.access(a_fname, os.R_OK):
        raise RuntimeError("Cannot read from file '{:s}'".format(
            a_fname))

    iform = itag = ilemma = ""
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            iform, itag, ilemma = TAB_RE.split(iline)
            iform = normalize(iform)
            if len(itag) > 1 and itag[:2] in INFORMATIVE_TAGS:
                FORM2LEMMA[iform] = normalize(ilemma)
            else:
                STOP_WORDS.add(iform)
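The loop above implies a tab-separated input layout of "form<TAB>tag<TAB>lemma". Below is a minimal, self-contained sketch of that parsing step; TAB_RE, INFORMATIVE_TAGS, and normalize() are stand-ins for the module-level definitions, and the sample lines are invented.

import re

TAB_RE = re.compile(r"\t+")
INFORMATIVE_TAGS = set(["NN", "VV", "AD"])   # hypothetical tag prefixes
FORM2LEMMA = {}
STOP_WORDS = set()


def normalize(s):
    # placeholder for the project's normalize()
    return s.strip().lower()


for iline in ["ging\tVVFIN\tgehen", "und\tKON\tund"]:
    iform, itag, ilemma = TAB_RE.split(iline)
    iform = normalize(iform)
    if len(itag) > 1 and itag[:2] in INFORMATIVE_TAGS:
        FORM2LEMMA[iform] = normalize(ilemma)   # informative word: remember its lemma
    else:
        STOP_WORDS.add(iform)                   # uninformative word: treat as stop word

# FORM2LEMMA == {'ging': 'gehen'}, STOP_WORDS == {'und'}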
Example #2
def _tkm_add_corpus(ising, a_cc_file):
    """Add lexical nodes from corpus to the Ising spin model

    @param ising - instance of the Ising spin model
    @param a_cc_file - file containing conjoined word pairs extracted from
      corpus

    @return \c void

    """
    ifields = []
    iwght = 1.
    ilemma1 = ilemma2 = ""
    with codecs.open(a_cc_file, 'r', ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            ifields = TAB_RE.split(iline)
            if len(ifields) != 3:
                continue
            ilemma1, ilemma2, iwght = ifields
            if ilemma1 in FORM2LEMMA:
                ilemma1 = FORM2LEMMA[ilemma1]
            if ilemma2 in FORM2LEMMA:
                ilemma2 = FORM2LEMMA[ilemma2]
            if check_word(ilemma1) and check_word(ilemma2):
                ising.add_edge(normalize(ilemma1),
                               normalize(ilemma2),
                               float(iwght),
                               a_add_missing=True)
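The file read above is expected to hold "lemma1<TAB>lemma2<TAB>weight" triples. The following is a hedged sketch of the edge-adding step; the Ising class here is a toy stand-in whose add_edge() merely records edges and only mirrors the call signature used in _tkm_add_corpus.

import re

TAB_RE = re.compile(r"\t+")


class ToyIsing(object):
    """Toy stand-in for the real Ising spin model."""

    def __init__(self):
        self.edges = []

    def add_edge(self, node1, node2, weight, a_add_missing=False):
        # the real model would also create missing spin nodes here
        self.edges.append((node1, node2, weight))


ising = ToyIsing()
for iline in ["gut\ttoll\t3.0", "gut\tschlecht\t-1.5"]:
    ilemma1, ilemma2, iwght = TAB_RE.split(iline)
    ising.add_edge(ilemma1, ilemma2, float(iwght), a_add_missing=True)

# ising.edges == [('gut', 'toll', 3.0), ('gut', 'schlecht', -1.5)]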
def _get_form2lemma(a_fname):
    """Read file containing form/lemma correspodences

    @param a_fname - name of input file

    @return void (correspondences are read into global variables)

    """
    global STOP_WORDS, FORM2LEMMA

    if not os.path.isfile(a_fname) or not os.access(a_fname, os.R_OK):
        raise RuntimeError("Cannot read from file '{:s}'".format(a_fname))

    iform = itag = ilemma = ""
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            iform, itag, ilemma = TAB_RE.split(iline)
            iform = normalize(iform)
            if len(itag) > 1 and itag[:2] in INFORMATIVE_TAGS:
                FORM2LEMMA[iform] = normalize(ilemma)
            else:
                STOP_WORDS.add(iform)
Example #4
def _output_cc_helper(a_cc_list):
    """
    Extract sets of coordinatively and adversatively conjoined terms

    @param a_cc_list - list of conjoined phrases

    @return 2-tuple with sets of coordinatively and adversatively conjoined terms
    """
    coord = set()
    advers = set()
    ret = (coord, advers)
    trg_coord = trg_advers = chld_coord = chld_advers = None

    if isinstance(a_cc_list, list):
        if not a_cc_list:
            return ret
        a_cc_list[0] = normalize(a_cc_list[0])
        if a_cc_list[0] in ADVERS_CC:
            trg_coord, trg_advers = advers, coord
        else:
            if a_cc_list[0] not in COORD_CC:
                if VERBOSE:
                    print("WARNING: Unknown coordinative conjunction: '{:s}'".format(repr(a_cc_list[0])), \
                              file = sys.stderr)
                coord.add(a_cc_list[0])
            trg_coord, trg_advers = coord, advers
        for chld in a_cc_list[1:]:
            chld_coord, chld_advers = _output_cc_helper(chld)
            trg_coord |= chld_coord
            trg_advers |= chld_advers
    else:
        coord.add(a_cc_list)
    return ret
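A worked example may help here: with COORD_CC = {"und", "oder", ","} and ADVERS_CC = {"aber"} (as defined further down this page), a nested chain behaves as traced below; the result is an illustration, not output reproduced from the original project.

# cc = ["und", "gut", ["aber", "teuer"]]
# coord, advers = _output_cc_helper(cc)
#
# "und" is a known coordinative conjunction, so the children of the outer
# chain go into `coord`; the embedded chain is headed by the adversative
# "aber", so its children go into `advers`.  The conjunctions themselves are
# only added when they are unknown, hence:
#
#   coord  == {"gut"}
#   advers == {"teuer"}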
Example #5
def _process_cc_helper(a_tree, a_iroot, a_cc_main, a_cc_seen=False):
    """
    Find coordinarively conjoined phrases in DG tree

    @param a_tree - DG tree to process
    @param a_index - root node of the tree
    @param a_cc_main - main list of coordinarively conjoined phrases to be
                       populated
    @param a_cc_seen - particular list of coordinarively conjoined phrases
                       coming from parent

    @return \c void
    """
    if a_tree.is_empty():
        return

    ret = [normalize(a_iroot.plemma)]
    # print("ret =", repr(ret), file = sys.stderr)
    # extract coordinatively conjoined chains
    for ichild in a_tree.children[a_iroot.idx]:
        # print("ichild.pdeprel =", repr(ichild.pdeprel), file = sys.stderr)
        if ichild.pdeprel in CC_RELATIONS:
            ret.append(_process_cc_helper(a_tree, ichild, a_cc_main, True))
        else:
            a_cc_main += _process_cc_helper(a_tree, ichild, a_cc_main, False)
    if len(ret) != 1:
        if a_cc_seen:
            return ret
        else:
            a_cc_main.append(ret)
            return []
    elif a_cc_seen:
        return ret[0]
    else:
        return []
Example #6
def _crp2mtx(a_crp_files, a_pos, a_neg,
             a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Construct sparse collocation matrix from raw corpus.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (int, dict, mtx) - number of tokens, mapping from tokens to
    vector ids, and adjacency matrix

    """
    # gather one-direction co-occurrence statistics
    max_vecid, word2vecid, tok_stat = _read_files(a_crp_files, a_pos, a_neg,
                                                  a_pos_re, a_neg_re)
    for w in chain(a_pos, a_neg):
        w = normalize(w)
        if w not in word2vecid:
            word2vecid[w] = max_vecid
            max_vecid += 1
    # convert cooccurrence statistics to a sparse matrix
    M = _tokstat2mtx(max_vecid, tok_stat)
    # iterate over the matrix and keep top 25 vectors with the highest cosine
    # similarity
    _prune_mtx(M)
    return (max_vecid, word2vecid, M.log1p())
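_tokstat2mtx() and _prune_mtx() are not shown on this page; the snippet below is only a hedged sketch of what a _tokstat2mtx-style conversion could look like, turning {(row_id, col_id): count} statistics into a sparse scipy adjacency matrix (the symmetrization step is an assumption).

import numpy as np
from scipy import sparse


def tokstat2mtx_sketch(max_vecid, tok_stat):
    # split the {(i, j): cnt} dict into coordinate/data arrays
    rows = np.array([i for i, _ in tok_stat], dtype=np.int32)
    cols = np.array([j for _, j in tok_stat], dtype=np.int32)
    data = np.array(list(tok_stat.values()), dtype=np.float64)
    m = sparse.csr_matrix((data, (rows, cols)),
                          shape=(max_vecid, max_vecid))
    # mirror the one-directional counts (assumption, to obtain a symmetric
    # adjacency matrix)
    return m + m.T


M = tokstat2mtx_sketch(3, {(0, 1): 4., (1, 2): 2.})
# M.toarray() ==
# [[0., 4., 0.],
#  [4., 0., 2.],
#  [0., 2., 0.]]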
Example #7
def _output_cc_helper(a_cc_list):
    """
    Extract sets of coordinatively and adversatively conjoined terms

    @param a_cc_list - list of conjoined phrases

    @return 2-tuple with sets of coordinatively and adversatively conjoined terms
    """
    coord = set()
    advers = set()
    ret = (coord, advers)
    trg_coord = trg_advers = chld_coord = chld_advers = None

    if isinstance(a_cc_list, list):
        if not a_cc_list:
            return ret
        a_cc_list[0] = normalize(a_cc_list[0])
        if a_cc_list[0] in ADVERS_CC:
            trg_coord, trg_advers = advers, coord
        else:
            if a_cc_list[0] not in COORD_CC:
                if VERBOSE:
                    print("WARNING: Unknown coordinative conjunction: '{:s}'".format(repr(a_cc_list[0])), \
                              file = sys.stderr)
                coord.add(a_cc_list[0])
            trg_coord, trg_advers = coord, advers
        for chld in a_cc_list[1:]:
            chld_coord, chld_advers = _output_cc_helper(chld)
            trg_coord |= chld_coord
            trg_advers |= chld_advers
    else:
        coord.add(a_cc_list)
    return ret
Example #8
def _process_cc_helper(a_tree, a_iroot, a_cc_main, a_cc_seen=False):
    """
    Find coordinatively conjoined phrases in DG tree

    @param a_tree - DG tree to process
    @param a_iroot - root node of the (sub-)tree to process
    @param a_cc_main - main list of coordinatively conjoined phrases to be
                       populated
    @param a_cc_seen - flag indicating whether a coordinative conjunction
                       was already seen in the parent

    @return coordinatively conjoined phrase(s) found under this node, or an
            empty list if they were appended to a_cc_main
    """
    if a_tree.is_empty():
        return

    ret = [normalize(a_iroot.plemma)]
    # print("ret =", repr(ret), file = sys.stderr)
    # extract coordinatively conjoined chains
    for ichild in a_tree.children[a_iroot.idx]:
        # print("ichild.pdeprel =", repr(ichild.pdeprel), file = sys.stderr)
        if ichild.pdeprel in CC_RELATIONS:
            ret.append(_process_cc_helper(a_tree, ichild, a_cc_main, True))
        else:
            a_cc_main += _process_cc_helper(a_tree, ichild, a_cc_main, False)
    if len(ret) != 1:
        if a_cc_seen:
            return ret
        else:
            a_cc_main.append(ret)
            return []
    elif a_cc_seen:
        return ret[0]
    else:
        return []
Example #9
def _crp2mtx(a_crp_files,
             a_pos,
             a_neg,
             a_pos_re=NONMATCH_RE,
             a_neg_re=NONMATCH_RE):
    """Construct sparse collocation matrix from raw corpus.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (int, dict, mtx) - number of tokens, mapping from tokens to
    vector ids, and adjacency matrix

    """
    # gather one-direction co-occurrence statistics
    max_vecid, word2vecid, tok_stat = _read_files(a_crp_files, a_pos, a_neg,
                                                  a_pos_re, a_neg_re)
    for w in chain(a_pos, a_neg):
        w = normalize(w)
        if w not in word2vecid:
            word2vecid[w] = max_vecid
            max_vecid += 1
    # convert cooccurrence statistics to a sparse matrix
    M = _tokstat2mtx(max_vecid, tok_stat)
    # iterate over the matrix and keep top 25 vectors with the highest cosine
    # similarity
    _prune_mtx(M)
    return (max_vecid, word2vecid, M.log1p())
def _read_set(a_fname):
    """Read initial seed set of terms.

    @param a_fname - name of input file containing terms

    @return void

    """
    global POS_SET, NEG_SET, NEUT_SET, POS_RE, NEG_RE
    fields = []
    pos_regs = []
    neg_regs = []
    with codecs.open(a_fname, 'r',
                     encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            elif iline.startswith(COMMENT):
                # maybe, we will later introduce some special comments
                continue
            fields = TAB_RE.split(iline)
            if len(fields) > 2 and fields[2] == REGEXP:
                if fields[1] == POSITIVE:
                    pos_regs.append(normalize_reg(fields[0]))
                elif fields[1] == NEGATIVE:
                    neg_regs.append(normalize_reg(fields[0]))
                else:
                    raise NotImplementedError(
                        "Regular expressions are not supported"
                        " for non-polar classes.")
                continue
            if fields[1] == POSITIVE:
                POS_SET.add(normalize(fields[0]))
            elif fields[1] == NEGATIVE:
                NEG_SET.add(normalize(fields[0]))
            elif fields[1] == NEUTRAL:
                NEUT_SET.add(normalize(fields[0]))
            else:
                raise RuntimeError(
                    "Unknown field specification: {:s}".format(fields[-1]))
    if pos_regs:
        POS_RE = join_regs(pos_regs)
    if neg_regs:
        NEG_RE = join_regs(neg_regs)
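normalize_reg() and join_regs() are not shown on this page. A plausible, hedged sketch of what join_regs() could do is to OR the collected seed patterns into a single compiled expression; the name join_regs_sketch below is hypothetical.

import re


def join_regs_sketch(regs):
    # combine individual seed patterns into one alternation
    return re.compile("|".join("(?:{:s})".format(r) for r in regs))


POS_RE = join_regs_sketch([r"wunder.*", r"super.*"])
# POS_RE.match("wunderbar") matches, POS_RE.match("schlecht") does not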
Example #11
def _read_set(a_fname):
    """Read initial seed set of terms.

    @param a_fname - name of input file containing terms

    @return void

    """
    global POS_SET, NEG_SET, NEUT_SET, POS_RE, NEG_RE
    fields = []
    pos_regs = []
    neg_regs = []
    with codecs.open(a_fname, 'r', encoding=ENCODING) as ifile:
        for iline in ifile:
            iline = iline.strip()
            if not iline:
                continue
            elif iline.startswith(COMMENT):
                # maybe, we will later introduce some special comments
                continue
            fields = TAB_RE.split(iline)
            if len(fields) > 2 and fields[2] == REGEXP:
                if fields[1] == POSITIVE:
                    pos_regs.append(normalize_reg(fields[0]))
                elif fields[1] == NEGATIVE:
                    neg_regs.append(normalize_reg(fields[0]))
                else:
                    raise NotImplementedError(
                        "Regular expressions are not supported"
                        " for non-polar classes.")
                continue
            if fields[1] == POSITIVE:
                POS_SET.add(normalize(fields[0]))
            elif fields[1] == NEGATIVE:
                NEG_SET.add(normalize(fields[0]))
            elif fields[1] == NEUTRAL:
                NEUT_SET.add(normalize(fields[0]))
            else:
                raise RuntimeError("Unknown field specification: {:s}".format(
                    fields[-1]))
    if pos_regs:
        POS_RE = join_regs(pos_regs)
    if neg_regs:
        NEG_RE = join_regs(neg_regs)
Example #12
def lemmatize(a_form, a_prune=True):
    """
    Convert word form to its lemma

    @param a_form - word form for which we should obtain lemma
    @param a_prune - flag indicating whether uninformative words
                    should be pruned

    @return lemma of the word form, or None if the word was pruned
    """
    a_form = normalize(a_form)
    if a_prune and a_form in STOP_WORDS:
        return None
    if a_form in FORM2LEMMA:
        return FORM2LEMMA[a_form]
    return a_form
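A minimal usage note with toy values for the module-level tables (the real ones are filled by _get_form2lemma(), and normalize() is assumed to lower-case its argument):

# STOP_WORDS = set(["und"])
# FORM2LEMMA = {"ging": "gehen"}
#
# lemmatize("ging")  ->  "gehen"   (known form is mapped to its lemma)
# lemmatize("und")   ->  None      (stop word is pruned)
# lemmatize("haus")  ->  "haus"    (unknown form is returned unchanged)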
Example #13
def _read_files_helper(a_crp_files, a_encoding=ENCODING):
    """Read corpus files and execute specified function.

    @param a_crp_files - files of the original corpus
    @param a_encoding - encoding of the vector file

    @return (Iterator over file lines)

    """
    i = 0
    tokens_seen = False
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', a_encoding) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline):
                    continue
                elif iline[0] == ESC_CHAR:
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    if tokens_seen:
                        tokens_seen = False
                        yield None, None, None
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                tokens_seen = True
                yield iform, itag, normalize(ilemma)
        yield None, None, None
Example #14
def _read_files_helper(a_crp_files, a_encoding=ENCODING):
    """Read corpus files and execute specified function.

    @param a_crp_files - files of the original corpus
    @param a_encoding - encoding of the vector file

    @return (Iterator over file lines)

    """
    i = 0
    tokens_seen = False
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', a_encoding) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline):
                    continue
                elif iline[0] == ESC_CHAR:
                    if FASTMODE:
                        i += 1
                        if i > 300:
                            break
                    if tokens_seen:
                        tokens_seen = False
                        yield None, None, None
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)),
                          file=sys.stderr)
                    continue
                tokens_seen = True
                yield iform, itag, normalize(ilemma)
        yield None, None, None
Example #15
def tang(a_N,
         a_emb_fname,
         a_pos,
         a_neg,
         a_neut,
         a_pos_re=NONMATCH_RE,
         a_neg_re=NONMATCH_RE,
         a_encoding=ENCODING):
    """Method for generating sentiment lexicons using Velikovich's approach.

    @param a_N - number of terms to extract
    @param a_emb_fname - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_encoding - encoding of the vector file

    @return list of terms sorted according to their polarities

    """
    w2i, EMBS, ndim = read_embeddings(a_emb_fname, a_encoding)
    X, Y = digitize_trainset(w2i, a_pos, a_neg, a_neut, a_pos_re, a_neg_re)
    train, validate, predict, params = init_nnet(EMBS, len(set(Y)), ndim)
    best_params = []
    best_acc = acc = -1
    N = len(Y)
    train_idcs, devtest_idcs = train_test_split(np.arange(N), test_size=0.1)
    devtest_N = float(len(devtest_idcs))
    devtest_X = X[devtest_idcs]
    devtest_Y = Y[devtest_idcs]
    # train
    epoch_i = 0
    prev_cost = 0
    while epoch_i < MAX_EPOCHS:
        np.random.shuffle(train_idcs)
        cost = 0.
        start_time = datetime.utcnow()
        for idx in train_idcs:
            x_i, y_i = X[idx], Y[idx]
            cost += train(x_i, y_i)
        acc = 0.
        for x_i, y_i in zip(devtest_X, devtest_Y):
            acc += validate(x_i, y_i)
        acc /= devtest_N
        if acc >= best_acc:
            best_params = [p.get_value() for p in params]
            best_acc = acc
            sfx = " *"
        else:
            sfx = ''
        end_time = datetime.utcnow()
        tdelta = (end_time - start_time).total_seconds()
        print("Iteration #{:d} ({:.2f} sec): cost = {:.2f}, "
              "accuracy = {:.2%};{:s}".format(epoch_i, tdelta, cost, acc, sfx),
              file=sys.stderr)
        if abs(prev_cost - cost) < EPSILON and epoch_i > MIN_EPOCHS:
            break
        else:
            prev_cost = cost
        epoch_i += 1
    if best_params:
        for p, val in zip(params, best_params):
            p.set_value(val)
    # apply trained classifier to unseen data
    ret = []
    for w, w_idx in w2i.iteritems():
        if normalize(w) in a_pos or a_pos_re.match(w):
            pol_cls = POSITIVE_LBL
            pol_score = FMAX
        elif normalize(w) in a_neg or a_neg_re.match(w):
            pol_cls = NEGATIVE_LBL
            pol_score = FMIN
        else:
            pol_idx, pol_score = predict(w_idx)
            pol_score = pol_score.item(0)
            if pol_idx == POSITIVE_IDX:
                pol_cls = POSITIVE_LBL
            elif pol_idx == NEGATIVE_IDX:
                pol_cls = NEGATIVE_LBL
            else:
                continue
        ret.append((w, pol_cls, pol_score))
    ret.sort(key=lambda el: abs(el[-1]), reverse=True)
    return ret
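The training loop above keeps the parameters of the best-scoring epoch and stops once the cost change falls below EPSILON after MIN_EPOCHS. Below is a self-contained sketch of that early-stopping pattern with dummy cost/accuracy curves; the real train()/validate() functions are compiled by init_nnet() and are not reproduced here.

MIN_EPOCHS, MAX_EPOCHS, EPSILON = 3, 20, 1e-2


def train_epoch(epoch):
    # dummy cost that halves every epoch, i.e. eventually converges
    return 10. * 0.5 ** epoch


def validate_epoch(epoch):
    # dummy accuracy that improves with the epoch number
    return min(1., 0.5 + 0.05 * epoch)


best_acc, best_epoch, prev_cost = -1., -1, 0.
for epoch_i in range(MAX_EPOCHS):
    cost = train_epoch(epoch_i)
    acc = validate_epoch(epoch_i)
    if acc >= best_acc:
        best_acc, best_epoch = acc, epoch_i      # remember the best epoch seen so far
    if abs(prev_cost - cost) < EPSILON and epoch_i > MIN_EPOCHS:
        break                                    # cost has converged
    prev_cost = cost

# with these dummies the loop stops at epoch 10 with best_epoch == 10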
Example #16
def tang(a_N, a_emb_fname, a_pos, a_neg, a_neut,
         a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE,
         a_encoding=ENCODING):
    """Method for generating sentiment lexicons using Velikovich's approach.

    @param a_N - number of terms to extract
    @param a_emb_fname - files of the original corpus
    @param a_pos - initial set of positive terms to be expanded
    @param a_neg - initial set of negative terms to be expanded
    @param a_neut - initial set of neutral terms to be expanded
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_neg_re - regular expression for matching negative terms
    @param a_encoding - encoding of the vector file

    @return list of terms sorted according to their polarities

    """
    w2i, EMBS, ndim = read_embeddings(a_emb_fname, a_encoding)
    X, Y = digitize_trainset(w2i, a_pos, a_neg, a_neut,
                             a_pos_re, a_neg_re)
    train, validate, predict, params = init_nnet(EMBS,
                                                 len(set(Y)), ndim)
    best_params = []
    best_acc = acc = -1
    N = len(Y)
    train_idcs, devtest_idcs = train_test_split(np.arange(N),
                                                test_size=0.1)
    devtest_N = float(len(devtest_idcs))
    devtest_X = X[devtest_idcs]
    devtest_Y = Y[devtest_idcs]
    # train
    epoch_i = 0
    prev_cost = 0
    while epoch_i < MAX_EPOCHS:
        np.random.shuffle(train_idcs)
        cost = 0.
        start_time = datetime.utcnow()
        for idx in train_idcs:
            x_i, y_i = X[idx], Y[idx]
            cost += train(x_i, y_i)
        acc = 0.
        for x_i, y_i in zip(devtest_X, devtest_Y):
            acc += validate(x_i, y_i)
        acc /= devtest_N
        if acc >= best_acc:
            best_params = [p.get_value() for p in params]
            best_acc = acc
            sfx = " *"
        else:
            sfx = ''
        end_time = datetime.utcnow()
        tdelta = (end_time - start_time).total_seconds()
        print("Iteration #{:d} ({:.2f} sec): cost = {:.2f}, "
              "accuracy = {:.2%};{:s}".format(epoch_i, tdelta,
                                              cost, acc, sfx),
              file=sys.stderr)
        if abs(prev_cost - cost) < EPSILON and epoch_i > MIN_EPOCHS:
            break
        else:
            prev_cost = cost
        epoch_i += 1
    if best_params:
        for p, val in zip(params, best_params):
            p.set_value(val)
    # apply trained classifier to unseen data
    ret = []
    for w, w_idx in w2i.iteritems():
        if normalize(w) in a_pos or a_pos_re.match(w):
            pol_cls = POSITIVE_LBL
            pol_score = FMAX
        elif normalize(w) in a_neg or a_neg_re.match(w):
            pol_cls = NEGATIVE_LBL
            pol_score = FMIN
        else:
            pol_idx, pol_score = predict(w_idx)
            pol_score = pol_score.item(0)
            if pol_idx == POSITIVE_IDX:
                pol_cls = POSITIVE_LBL
            elif pol_idx == NEGATIVE_IDX:
                pol_cls = NEGATIVE_LBL
            else:
                continue
        ret.append((w, pol_cls, pol_score))
    ret.sort(key=lambda el: abs(el[-1]), reverse=True)
    return ret
Example #17
def check_in_seeds(a_form, a_lemma, a_seeds, a_seed_re):
    """Check whether a word form or its lemma belongs to the seed set."""
    if a_seed_re.search(a_form) or a_seed_re.search(a_lemma) \
       or a_form in a_seeds or normalize(a_form) in a_seeds \
       or a_lemma in a_seeds:
        return True
    return False
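A brief usage note (hedged): NONMATCH_RE is assumed to be a pattern that never matches, e.g. re.compile(r"(?!)"), as its use as a default elsewhere on this page suggests, and normalize() is assumed to lower-case the form.

# seeds = set(["gut"])
# check_in_seeds("Gut", "gut", seeds, NONMATCH_RE)        # True: lemma is a seed
# check_in_seeds("billige", "billig", seeds, NONMATCH_RE)  # False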
Example #18
def check_in_seeds(a_form, a_lemma, a_seeds, a_seed_re):
    """Check whether a word form or its lemma belongs to the seed set."""
    if a_seed_re.search(a_form) or a_seed_re.search(a_lemma) \
       or a_form in a_seeds or normalize(a_form) in a_seeds \
       or a_lemma in a_seeds:
        return True
    return False
Example #19
import argparse
import codecs
import re
import string
import sys

##################################################################
# Variables and Constants
VERBOSE = False
PUNCT_RE = re.compile(r"^(?:" + '|'.join([re.escape(c) for c in string.punctuation]) + \
                          ")+$")
ESC_CHAR = ""
ENCODING = "utf-8"
CC_RELATIONS = set(["CD", "CJ"])
ADVERS_CC = set([normalize(w) for w in ["aber"]])
COORD_CC = set([normalize(w) for w in ["und", "oder", ","]])

##################################################################
# Methods
def _find_roots(a_tree):
    """
    Find roots of DG tree

    @param a_tree - DG tree to process

    @return iterator over root nodes
    """
    for inode in a_tree:
        if inode.phead == '0':
            yield inode
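A minimal usage sketch with a toy node type: the real DG tree nodes come from the dependency parser, and only a 'phead' attribute is needed here (the Node fields below are invented).

from collections import namedtuple

Node = namedtuple("Node", ["idx", "plemma", "phead"])

tree = [Node(1, "gut", "0"),     # head id '0' marks a root node
        Node(2, "und", "3"),
        Node(3, "toll", "1")]

roots = list(_find_roots(tree))
# roots == [Node(idx=1, plemma='gut', phead='0')]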
Example #20
def add_seeds(seeds, label):
    """Add seed terms to the training set with the given label."""
    # w2i, X, and Y presumably come from the enclosing scope (nested helper)
    for iterm in seeds:
        iterm = normalize(iterm)
        if iterm in w2i:
            X.append(w2i[iterm])
            Y.append(label)
Example #21
import argparse
import codecs
import re
import string
import sys

##################################################################
# Variables and Constants
VERBOSE = False
PUNCT_RE = re.compile(r"^(?:"
                      + '|'.join(re.escape(c) for c in string.punctuation)
                      + r")+$")
ESC_CHAR = ""
ENCODING = "utf-8"
CC_RELATIONS = set(["CD", "CJ"])
ADVERS_CC = set([normalize(w) for w in ["aber"]])
COORD_CC = set([normalize(w) for w in ["und", "oder", ","]])


##################################################################
# Methods
def _find_roots(a_tree):
    """
    Find roots of DG tree

    @param a_tree - DG tree to process

    @return iterator over root nodes
    """
    for inode in a_tree:
        if inode.phead == '0':
            yield inode
Example #22
def _read_files(a_crp_files,
                a_pos,
                a_neg,
                a_pos_re=NONMATCH_RE,
                a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []
    tok_stat = Counter()
    word2cnt = Counter()
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)),
                          file=sys.stderr)
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.iteritems():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.iteritems()
                if w1 in word2vecid and w2 in word2vecid and cnt >= MIN_TOK_CNT
                }
    return (max_vecid, word2vecid, tok_stat)
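The inner loop above pairs every accepted lemma with the lemmas kept in a short sliding window of preceding context. A self-contained sketch of just that counting step:

from collections import Counter

TOK_WINDOW = 2
prev_lemmas = []
tok_stat = Counter()

for ilemma in ["das", "essen", "war", "gut"]:
    for plemma in prev_lemmas:
        tok_stat[(plemma, ilemma)] += 1   # count (previous lemma, current lemma) pairs
    while len(prev_lemmas) > TOK_WINDOW:
        prev_lemmas.pop(0)                # drop lemmas that fell out of the window
    prev_lemmas.append(ilemma)

# tok_stat now holds counts for pairs such as ('das', 'essen'),
# ('essen', 'war'), ('war', 'gut'), ...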
Example #23
def _read_files(a_crp_files, a_pos, a_neg,
                a_pos_re=NONMATCH_RE, a_neg_re=NONMATCH_RE):
    """Read corpus files and populate one-directional co-occurrences.

    @param a_crp_files - files of the original corpus
    @param a_pos - initial set of positive terms
    @param a_neg - initial set of negative terms
    @param a_pos_re - regular expression for matching positive terms
    @param a_neg_re - regular expression for matching negative terms

    @return (max_vecid, word2vecid, tok_stat)

    @note constructs statistics in place

    """
    print("Reading corpus...", end="", file=sys.stderr)
    i = 0
    prev_lemmas = []
    tok_stat = Counter()
    word2cnt = Counter()
    iform = itag = ilemma = ""
    for ifname in a_crp_files:
        with codecs.open(ifname, 'r', ENCODING) as ifile:
            for iline in ifile:
                iline = iline.strip().lower()
                if not iline or SENT_END_RE.match(iline) \
                   or iline[0] == ESC_CHAR:
                    if FASTMODE and prev_lemmas:
                        i += 1
                        if i > 300:
                            break
                    if prev_lemmas:
                        del prev_lemmas[:]
                    continue
                try:
                    iform, itag, ilemma = TAB_RE.split(iline)
                except ValueError:
                    print("Invalid line format at line: {:s}".format(
                        repr(iline)), file=sys.stderr
                    )
                    continue
                ilemma = normalize(ilemma)
                if a_pos_re.search(iform) or a_neg_re.search(iform) \
                   or a_pos_re.search(ilemma) or a_neg_re.search(ilemma):
                    pass
                elif itag[:2] not in INFORMATIVE_TAGS \
                        or not check_word(ilemma):
                    continue
                word2cnt[ilemma] += 1
                for plemma in prev_lemmas:
                    tok_stat[(plemma, ilemma)] += 1
                while len(prev_lemmas) > TOK_WINDOW:
                    prev_lemmas.pop(0)
                prev_lemmas.append(ilemma)
        del prev_lemmas[:]
    print(" done", file=sys.stderr)
    max_vecid = 0
    word2vecid = {}
    # convert words to vector ids if their counters are big enough
    for w, cnt in word2cnt.iteritems():
        if cnt >= MIN_TOK_CNT or w in a_pos or w in a_neg:
            word2vecid[w] = max_vecid
            max_vecid += 1
    word2cnt.clear()
    # convert words to vector ids in context counter
    tok_stat = {(word2vecid[w1], word2vecid[w2]): cnt
                for (w1, w2), cnt in tok_stat.iteritems()
                if w1 in word2vecid and w2 in word2vecid
                and cnt >= MIN_TOK_CNT
                }
    return (max_vecid, word2vecid, tok_stat)
Example #24
def add_seeds(seeds, label):
    """Add seed terms to the training set with the given label."""
    # w2i, X, and Y presumably come from the enclosing scope (nested helper)
    for iterm in seeds:
        iterm = normalize(iterm)
        if iterm in w2i:
            X.append(w2i[iterm])
            Y.append(label)