def reformat_data(conll_data_dir, text_only_data_dir, remove_pos):
    """
    Data formatting
    ===============
    `word2vec` produces vectors for plain words, such as `computer`, whereas the rest of my experiments assume they
    are augmented with a PoS tag, e.g. `computer/N`. To get around that, start with a directory of CoNLL-formatted
    files such as

    ```
    1   Anarchism   Anarchism   NNP MISC    5   nsubj
    2   is  be  VBZ O   5   cop
    3   a   a   DT  O   5   det
    4   political   political   JJ  O   5   amod
    5   philosophy  philosophy  NN  O   0   root
    ```

    and convert them to PoS-augmented format (using coarse tags, like Petrov's):

    ```
    Anarchism/N is/V a/DET ....
    ```
    :param conll_data_dir: input directory in CoNLL format
    :param text_only_data_dir: output directory
    :param remove_pos: if True, write plain tokens without the PoS-tag suffix
    """
    mkdirs_if_not_exists(text_only_data_dir)
    Parallel(n_jobs=5)(delayed(_reformat_single_file)(conll_data_dir, filename, text_only_data_dir, remove_pos)
                       for filename in os.listdir(conll_data_dir))
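# A minimal sketch of what the per-file worker might look like (the real _reformat_single_file
# is defined elsewhere in this module; the column layout is taken from the CoNLL sample in the
# docstring above, and the one-letter tag coarsening is a placeholder assumption):
def _reformat_single_file_sketch(conll_data_dir, filename, text_only_data_dir, remove_pos):
    tokens = []
    with open(os.path.join(conll_data_dir, filename)) as infile:
        for line in infile:
            parts = line.split()
            if len(parts) < 4:
                continue  # blank separator line between sentences
            word, fine_tag = parts[1], parts[3]
            # e.g. 'Anarchism' + 'NNP' -> 'Anarchism/N', or just 'Anarchism' if remove_pos
            tokens.append(word if remove_pos else '%s/%s' % (word, fine_tag[0]))
    with open(os.path.join(text_only_data_dir, filename), 'w') as outfile:
        outfile.write(' '.join(tokens))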
def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al. (EMNLP 2014, §3).
    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain other document
     features too. Such a file is output by `find_all_NPs.py`, which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of output file; must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        # np.sum over a generator is deprecated; sum the rank-1 outer products explicitly
        vt = sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')
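# A toy illustration of the verb-matrix construction above (hypothetical 3-dimensional noun
# vectors; the real code pulls rows out of a Vectors store): each SVO contributes the outer
# product of its subject and object vectors, and the verb matrix is the sum of those.
import numpy as np

_subj = np.array([1.0, 0.0, 2.0])  # e.g. vector for 'cat/N'
_obj = np.array([0.5, 1.0, 0.0])   # e.g. vector for 'mouse/N'
_vt = np.outer(_subj, _obj)        # one rank-1 D x D contribution
assert _vt.shape == (3, 3)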
def run_stanford_pipeline(data_dir, stanford_dir, java_threads=2,
                          filelistdir=""):
    """
    Process directory of text using stanford core nlp
    suite. Perform:
        - Tokenisation
        - Sentence segmentation
        - PoS tagging
        - Lemmatisation
        - Parsing

    Output CoNLL to "*data_dir*-tagged"
    """
    if not all([data_dir, stanford_dir]):
        raise ValueError("ERROR: Must specify path to data and stanford tools.")

    # Create output directory
    output_dir = "%s-tagged" % data_dir
    try:
        os.mkdir(output_dir)
    except OSError:
        pass  # Directory already exists

    # Change working directory to stanford tools
    os.chdir(stanford_dir)

    logging.info("<%s> Beginning stanford pipeline..." % current_time())

    for data_sub_dir in [name for name in os.listdir(data_dir) if
                         not name.startswith(".")]:
        # Setup output subdirectory
        output_sub_dir = os.path.join(output_dir, data_sub_dir)
        input_sub_dir = os.path.join(data_dir, data_sub_dir)
        mkdirs_if_not_exists(output_sub_dir)

        # Create list of files to be processed.
        filelist = os.path.join(filelistdir if filelistdir else stanford_dir,
                                "%s-filelist.txt" % data_sub_dir)
        _make_filelist_and_create_files(input_sub_dir, filelist, output_sub_dir)

        logging.info("<%s> Beginning stanford processing: %s" % (
            current_time(), input_sub_dir))

        # Construct stanford java command.
        stanford_cmd = ['./corenlp.sh', '-annotators',
                        'tokenize,ssplit,pos,lemma,parse',
                        '-filelist', filelist,
                        '-outputDirectory', output_sub_dir,
                        '-threads', str(java_threads), '-outputFormat', 'conll',
                        '-outputExtension', '.tagged', '-parse.maxlen', '50']

        logging.info("Running: \n" + str(stanford_cmd))

        # Run stanford script, block until complete.
        subprocess.call(stanford_cmd)
        logging.info("<%s> Stanford complete for path: %s" % (current_time(), output_sub_dir))

    logging.info("<%s> All stanford complete." % current_time())
    return output_dir
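# Hypothetical usage (paths are placeholders, not from the source); the return value is the
# "<data_dir>-tagged" directory containing one CoNLL .tagged file per input document:
# tagged_dir = run_stanford_pipeline('data/wikipedia', '/opt/stanford-corenlp-full', java_threads=8)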
def build_full_composed_thesauri_with_baroni_and_svd(args):
    # SET UP A FEW REQUIRED PATHS

    byblo_opts, _ = parse_byblo_conf_file(args.conf)
    input_file_name = os.path.basename(byblo_opts.input)
    # INPUT 1:  DIRECTORY. Must contain a single conf file
    unigram_vectors_dir = os.path.abspath(byblo_opts.output)
    mkdirs_if_not_exists(unigram_vectors_dir)
    unigram_vectors_dir_ppmi = '%s-ppmi' % os.path.dirname(byblo_opts.output)
    mkdirs_if_not_exists(unigram_vectors_dir_ppmi)
    unigram_vectors_dir_ppmi_svd = '%s-ppmi-svd' % os.path.dirname(byblo_opts.output)
    mkdirs_if_not_exists(unigram_vectors_dir_ppmi_svd)

    # INPUT 2: A FILE, TSV, underscore-separated observed vectors for ANs and NNs
    SVD_DIMS = 100

    ngram_vectors_dir = '%s-ppmi-svd-composed' % os.path.dirname(byblo_opts.output)
    mkdirs_if_not_exists(ngram_vectors_dir)
    composer_algos = [AdditiveComposer, MultiplicativeComposer, LeftmostWordComposer,
                      VerbComposer, RightmostWordComposer]

    # EXTRACT UNIGRAM VECTORS WITH BYBLO
    if 'unigrams' in args.stages:
        calculate_unigram_vectors(os.path.abspath(args.conf), os.path.abspath(args.byblo))
    else:
        logging.warning('Skipping unigrams stage. Assuming output is at %s',
                        byblo_opts.output)

    # FEATURE REWEIGHTING (PPMI); run when the 'ppmi' stage is requested
    if 'ppmi' in args.stages:
        _do_ppmi(_find_events_file(byblo_opts.output), unigram_vectors_dir_ppmi)

    # REDUCE DIMENSIONALITY
    # add in observed AN/NN vectors for SVD processing. Reduce both unigram vectors and observed phrase vectors
    # together and put the output into the same file
    unreduced_unigram_events_file = _find_events_file(unigram_vectors_dir_ppmi)
    # ...exp6-12/exp6.events.filtered.strings --> ...exp6-12/exp6
    reduced_file_prefix = join(unigram_vectors_dir_ppmi_svd, input_file_name)
    # only keep the most frequent types per PoS tag to speed things up
    counts = [('N', 200000), ('V', 200000), ('J', 100000), ('RB', 0), ('AN', 0), ('NN', 0)]
    if 'svd' in args.stages:
        # in this case the name exp%d-with-obs-phrases is massively misleading because
        # there aren't any obs phrase vectors
        # let's just do SVD on the unigram phrases so we can compose them simply later
        do_svd(unreduced_unigram_events_file, reduced_file_prefix,
               desired_counts_per_feature_type=counts, reduce_to=[SVD_DIMS])
    else:
        logging.warning('Skipping SVD stage. Assuming output is at %s-SVD*', reduced_file_prefix)

    # construct the names of files output by do_svd
    all_reduced_vectors = '%s-SVD%d.events.filtered.strings' % (reduced_file_prefix, SVD_DIMS)

    if 'compose' in args.stages:
        # it is OK for the first parameter to contain phrase vectors; there is explicit filtering downstream.
        # The assumption is that these are observed phrasal vectors.
        compose_and_write_vectors(all_reduced_vectors,
                                  '%s-%s' % (input_file_name, SVD_DIMS),
                                  composer_algos, output_dir=ngram_vectors_dir, dense_hd5=True)
    else:
        logging.warning('Skipping composition stage. Assuming output is at %s', ngram_vectors_dir)
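# Hypothetical driver for the pipeline above; the attribute names (conf, byblo, stages) are
# inferred from how `args` is used in the function, and the paths are placeholders:
# from argparse import Namespace
# build_full_composed_thesauri_with_baroni_and_svd(
#     Namespace(conf='conf/exp6/exp6.conf', byblo='Byblo-2.2.0',
#               stages={'unigrams', 'ppmi', 'svd', 'compose'}))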
def run_glove():
    logging.info('Starting training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert their format to ours
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(df))
    # remove any junk-looking tokens; they would get in the way later
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)
    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
def build_thesaurus_out_of_vectors(vectors_path, out_dir, threads=4, num_neighbours=100, sim_function='Cosine'):
    """
    Builds a Byblo thesaurus out of the provided vectors, however these were constructed. This function will make an
    uncompressed copy of the provided vectors file- might be slow and use up a lot of extra space.

    :param vectors_path: input vectors in byblo format, compressed or not
    :param out_dir: where to put the thesaurus and all temp file
    :param threads: number of byblo threads
    :param num_neighbours: number of nearest neighbours per entry to output
    :param sim_function: similarity measure between vectors to use. see byblo docs
    """
    from discoutils.thesaurus_loader import Vectors

    BYBLO_BASE_DIR = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/Byblo-2.2.0'
    vectors_path = os.path.abspath(vectors_path)
    out_dir = os.path.abspath(out_dir)
    mkdirs_if_not_exists(out_dir)
    v = Vectors.from_tsv(vectors_path)

    # prepare the files that byblo expects
    outf_basename = os.path.join(out_dir, 'input')
    # outf_basename is already an absolute path under out_dir, so no further join is needed
    events_file = outf_basename + '.events.filtered.strings'
    entries_file = outf_basename + '.entries.filtered.strings'
    features_file = outf_basename + '.features.filtered.strings'

    v.to_plain_txt(events_file, entries_file, features_file)
    # write the byblo conf file
    conf = '--input {} --output {} --threads {} --similarity-min 0.01 -k {} ' \
           '--measure {} --stages allpairs,knn,unenumerate'.format(outf_basename, out_dir, threads,
                                                                   num_neighbours, sim_function)
    conf_path = os.path.join(out_dir, 'conf.txt')
    with open(conf_path, 'w') as outf:
        for line in conf.split():
            outf.write(line)
            outf.write('\n')

    # go baby go
    with temp_chdir(BYBLO_BASE_DIR):
        reindex_all_byblo_vectors(outf_basename)
        run_byblo(conf_path, touch_input_file=True)
        unindex_all_byblo_vectors(outf_basename)
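# Because the options string above is written one whitespace-separated token per line, the
# resulting conf.txt looks roughly like this (values depend on the arguments passed in):
#   --input
#   <out_dir>/input
#   --output
#   <out_dir>
#   --threads
#   4
#   --similarity-min
#   0.01
#   ...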
def run_experiment(conf):
    start_time = datetime.now()
    mkdirs_if_not_exists(conf['output_dir'])
    test_path = ''
    tr_data = conf['training_data']
    if conf['test_data']:
        test_path = conf['test_data']

    # LOADING RAW TEXT
    x_tr, y_tr, x_test, y_test = get_tokenized_data(tr_data,
                                                    get_tokenizer_settings_from_conf(conf),
                                                    test_data=test_path)

    # CREATE CROSSVALIDATION ITERATOR
    cv_iterator, y_vals = _build_crossvalidation_iterator(conf['crossvalidation'],
                                                          y_tr, y_test)
    if x_test is not None:
        # concatenate all data, the CV iterator will make sure x_test is used for testing
        x_vals = list(x_tr)
        x_vals.extend(list(x_test))
    else:
        x_vals = x_tr

    all_scores = []
    params = []
    for i, (train_idx, test_idx) in enumerate(cv_iterator):
        params.append((conf, i, multiple_scores, test_idx, train_idx, x_vals, y_vals))
        if conf['crossvalidation']['break_after_first']:
            # only do one train/test split to save time
            logging.warning('Only using the first CV fold')
            break

    scores_over_cv = [_cv_loop(*foo) for foo in params]
    all_scores.extend([score for one_set_of_scores in scores_over_cv for score in one_set_of_scores])
    _store_scores(all_scores, conf['output_dir'], conf['name'])
    total_time = (datetime.now() - start_time).total_seconds() / 60
    logging.info('MINUTES TAKEN %.2f' % total_time)
    def __init__(self, prefix, stage, cv_fold, n_replacements=3):
        self.token_counts = Counter()
        self.paraphrases = Counter()
        self.prefix = prefix  # store data here instead of in memory
        self.stage = stage
        self.cv_fold = cv_fold

        self.par_file = '%s.%s.csv.gz' % (self.prefix, 'par')  # paraphrases
        self.tc_file = '%s.%s.csv.gz' % (self.prefix, 'tc')  # term counts
        self.max_paraphrases = n_replacements

        mkdirs_if_not_exists(os.path.dirname(self.par_file))
        mkdirs_if_not_exists(os.path.dirname(self.tc_file))
        if cv_fold == 0 and stage == 'tr':
            # experiment just started, write header to output files
            with gzip.open(self.tc_file, 'wb') as outfile:
                outfile.write(bytes('# feature counts in labelled data\n', encoding='UTF8'))
                outfile.write(bytes('cv_fold,stage,feature,IV,IT,count\n', encoding='UTF8'))
            with gzip.open(self.par_file, 'wb') as outfile:
                outfile.write(bytes('# Replacements made at decode time\n', encoding='UTF8'))
                repl_header = ','.join('neigh{0},neigh{0}_sim'.format(i + 1) for i in range(n_replacements))
                header = 'cv_fold,stage,feature,available_replacements,%s,count\n' % repl_header
                outfile.write(bytes(header, encoding='UTF8'))
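    # With the default n_replacements=3, the header row written to the paraphrases file above is:
    #   cv_fold,stage,feature,available_replacements,neigh1,neigh1_sim,neigh2,neigh2_sim,neigh3,neigh3_sim,count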
def compute_and_write_vectors(corpus_name, stages, percent, repeat, remove_pos):
    prefix = os.path.abspath(os.path.join(__file__, '..', '..'))
    output_dir = join(prefix, 'outputs', 'word2vec')
    mkdirs_if_not_exists(output_dir)

    # inputs
    conll_data_dir = join(prefix, 'data/%s-conll' % corpus_name)
    # outputs
    if remove_pos:
        text_only_data_dir = join(prefix, 'data/%s-nopos' % corpus_name)
        unigram_events_file = join(output_dir, '%s-nopos-%dperc.unigr.strings' % (corpus_name, percent))
    else:
        text_only_data_dir = join(prefix, 'data/%s' % corpus_name)
        unigram_events_file = join(output_dir, '%s-%dperc.unigr.strings' % (corpus_name, percent))

    if percent > 90 and repeat > 1:
        raise ValueError('Repeating with a different sample of corpus only makes sense when '
                         'the samples are sufficiently distinct. This requires that the sample'
                         ' size is fairly small to minimise overlap between samples')

    if 'reformat' in stages:
        reformat_data(conll_data_dir, text_only_data_dir, remove_pos)

    if 'vectors' in stages:
        models = [_train_model(percent, text_only_data_dir, i, remove_pos) for i in range(repeat)]

        vectors = []
        # write the output of each run separately
        for i in range(repeat):
            output_path = unigram_events_file + '.rep%d' % i
            vectors.append(write_gensim_vectors_to_tsv(models[i], output_path))

        if 'average' in stages and repeat > 1:
            # combine (sum) each run's vectors over the shared vocabulary and append to the list to be written
            shared_vocab = set.intersection(*[set(model.vocab.keys()) for model in models])
            output_path = unigram_events_file + '.avg%d' % repeat
            model = {}
            for k in shared_vocab:
                model[k] = reduce(np.add, [m[k] for m in models])
            vectors.append(write_gensim_vectors_to_tsv(model, output_path, vocab=shared_vocab))
    else:
        # pretend something was written above; these placeholders just let the loop below run
        vectors = [None] * repeat + ([None] if 'average' in stages and repeat > 1 else [])
    if 'compose' in stages:
        for i, v in enumerate(vectors):
            # if we'll also be composing we don't have to write the unigram vectors to disk
            # just to read them back later.
            if 'average' in stages and i == (len(vectors) - 1) and len(vectors) > 1:
                # last set of vectors in the list; these are the averaged ones
                out_path = 'word2vec-%s_%dpercent-avg%d' % (corpus_name, percent, repeat)
                input_thing = v if 'vectors' in stages else unigram_events_file + '.avg%d' % repeat
            else:
                out_path = 'word2vec-%s_%dpercent-rep%d' % (corpus_name, percent, i)
                input_thing = v if 'vectors' in stages else unigram_events_file + '.rep%d' % i
            row_filter = default_row_filter_nopos if remove_pos else default_row_filter
            compose_and_write_vectors(input_thing,
                                      out_path,
                                      composer_algos,
                                      output_dir=output_dir,
                                      row_filter=row_filter,
                                      remove_pos=remove_pos,
                                      dense_hd5=True)
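# Hypothetical invocation (corpus name and settings are placeholders): reformat the CoNLL data,
# train word2vec on three independent 15% samples, then average the runs and compose phrases.
# compute_and_write_vectors('gigaword', ['reformat', 'vectors', 'average', 'compose'],
#                           percent=15, repeat=3, remove_pos=False)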
# the two paths below need to point to the same thing
phrases_to_compose = os.path.join(prefix, '..', 'thesisgenerator',
                                  'features_in_labelled', 'socher.txt')
socher_input_file = os.path.join(socher_base_dir, 'parsed.txt')
plaintext_socher_input_file = os.path.join(prefix, '..', 'eval',
                                           'features_in_labelled', 'all_features.txt')

socher_output_phrases_file = os.path.join(socher_base_dir, 'phrases.txt')
socher_output_vectors_file = os.path.join(socher_base_dir, 'outVectors.txt')
socher_unigram_embedding_matlab = os.path.join(socher_base_dir, 'vars.normalized.100.mat')

# output of reformat stage
turian_unigram_vectors_file = os.path.join(socher_base_dir, 'turian_unigrams.h5')
output_dir = os.path.join(socher_base_dir, 'composed')
mkdirs_if_not_exists(output_dir)
socher_composed_vectors_file = os.path.join(output_dir, 'AN_NN_turian_Socher.events.filtered.strings')


def run_socher_code():
    # symlink the file Socher's code expects to the list of phrases I'm interested in
    force_symlink(phrases_to_compose, socher_input_file)
    with temp_chdir(socher_base_dir):
        run_and_log_output('./phrase2Vector.sh')  # this takes a while
        # output files are phrases.txt and outVectors.txt


def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s Matlab code into Byblo-compatible files.
    """
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al.'s multistep regression VO/SVO model
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    # entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file,
    #             entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file,
                entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))

    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")

    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
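# Illustrative shape of the training triples built above (token spellings are assumptions,
# following the underscore-separated word/PoS convention used elsewhere in this code):
#   train_vo_data entry: ('buy/V_car/N', 'people/N', 'people/N_buy/V_car/N')  # (VO function, subject, SVO)
#   train_v_data entry:  ('buy/V', 'car/N', 'buy/V_car/N')                    # (verb function, object, VO)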
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """

    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
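# Illustrative shape of one entry of all_data above (token spellings are assumptions): the
# modifier is treated as a lexical function applied to the head noun to predict the observed AN/NN vector,
#   e.g. ('black/J', 'cat/N', 'black/J_cat/N')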
def _write_features_of_single_corpus_to_file(all_phrases, corpus_name):
    ALL_FEATURES_FILE = '%s/%s_all_features.txt' % (ROOT, corpus_name)
    NP_MODIFIERS_FILE = '%s/%s_np_modifiers.txt' % (ROOT, corpus_name)
    VERBS_FILE = '%s/%s_verbs.txt' % (ROOT, corpus_name)
    SOCHER_FILE = '%s/%s_socher.txt' % (ROOT, corpus_name)

    logging.info('Writing %d unique document features to files in %s', len(all_phrases), ROOT)

    # How stanford parser formats NPs and VPs
    # (ROOT
    # (NP (NN acquisition) (NN pact)))
    #
    # (ROOT
    # (NP (JJ pacific) (NN stock)))
    stanford_NP_pattern = '(ROOT\n (NP ({} {}) ({} {})))\n\n'

    # (ROOT
    # (S
    # (NP (NNS cats))
    # (VP (VBP eat)
    # (NP (NNS dogs)))))
    stanford_SVO_pattern = '(ROOT\n  (S\n    (NP (NN {}))\n    (VP (VB {})\n      (NP (NN {})))))\n\n'

    # (ROOT
    # (S
    # (VP (VB eat)
    # (NP (NNS cats)))))
    stanford_VO_pattern = '(ROOT\n  (S\n    (VP (VB {})\n      (NP (NN {})))))\n\n'

    # (ROOT
    # (NP (NN roads)))
    # I checked that this extracts the neural word embedding for the word
    stanford_unigram_pattern = '(ROOT\n (NP ({} {})))\n\n'

    mkdirs_if_not_exists(ROOT)
    logging.info('Writing all document features to files')
    seen_modifiers, seen_verbs = set(), set()

    with open(SOCHER_FILE, 'w') as outf_socher, \
            open(NP_MODIFIERS_FILE, 'w') as outf_mods, \
            open(VERBS_FILE, 'w') as outf_verbs, \
            open(ALL_FEATURES_FILE, 'w') as outf_plain:

        for item in all_phrases:
            item = DocumentFeature.from_string(item)
            # write in my underscore-separated format
            outf_plain.write(str(item) + '\n')

            if item.type in {'AN', 'NN'}:
                # write the phrase in Socher's format
                string = stanford_NP_pattern.format(item.tokens[0].pos * 2, item.tokens[0].text,
                                                    item.tokens[1].pos * 2, item.tokens[1].text)
                outf_socher.write(string)
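                # e.g. for the NN phrase 'acquisition/N_pact/N' the coarse tag is doubled ('N' * 2 == 'NN'),
                # so this becomes stanford_NP_pattern.format('NN', 'acquisition', 'NN', 'pact'),
                # i.e. the bracketing shown in the comment block at the top of this function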

            if item.type in {'VO', 'SVO'}:
                verb = str(item.tokens[-2])
                if verb not in seen_verbs:
                    seen_verbs.add(verb)
                    outf_verbs.write(verb)
                    outf_verbs.write('\n')

            if item.type == 'VO':
                string = stanford_VO_pattern.format(*[x.tokens[0].text for x in item])
                outf_socher.write(string)

            if item.type == 'SVO':
                string = stanford_SVO_pattern.format(*[x.tokens[0].text for x in item])
                outf_socher.write(string)

            if item.type in {'AN', 'NN'}:
                # write just the modifier separately
                first = str(item.tokens[0])
                if first not in seen_modifiers:
                    outf_mods.write('%s\n' % first)
                    seen_modifiers.add(first)

            if item.type == '1-GRAM':
                string = stanford_unigram_pattern.format(item.tokens[0].pos * 2, item.tokens[0].text)
                outf_socher.write(string)

            if item.type not in {'1-GRAM', 'AN', 'NN', 'VO', 'SVO'}:  # there shouldn't be any other features
                raise ValueError('Item %r has the wrong feature type: %s' % (item, item.type))
    if not os.path.exists(arg):
        parser.error("The conf file %s does not exist!" % arg)
    else:
        return arg


if __name__ == '__main__':
    # parse command-line arguments (conf file only)
    parser = argparse.ArgumentParser(description='Evaluate vector via document classification')
    parser.add_argument('conf_file',
                        help='Conf file that defines the experiment',
                        type=is_valid_file)

    args = parser.parse_args()
    conf, configspec_file = parse_config_file(args.conf_file)
    mkdirs_if_not_exists(conf['output_dir'])

    # set up logging to file
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s",
                        datefmt='%m-%d %H:%M',
                        filename=os.path.join(conf['output_dir'], 'log.txt'),
                        filemode='w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter("%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s")
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
def conf():
    config, _ = parse_config_file(conf_file)
    mkdirs_if_not_exists(config['output_dir'])
    return config