Example #1
def preprocess_for_brown_clustering():

    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    keys = articles.keys()
    keys.sort()

    items = keys

    print len(items)

    processed_dict = {}
    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'input', 'txt')

    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        for k in keys:
            text = articles[k]
            tokens = []
            sentences = text.split('\n')
            for s in sentences:
                sent_tokens = tokenizer.split_into_words(s, reattach=False, split_off_quotes=False,
                                                         lemmatize=False, replace_numbers=True)
                tokens = tokens + sent_tokens
            if k in items:
                output_file.write(' '.join(tokens) + '\n')
            processed_dict[k] = tokens

    output_filename = fh.make_filename(dirs.data_processed_brown_dir, 'processed', 'json')
    fh.write_to_json(processed_dict, output_filename)
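
The fh, dirs, and tokenizer helpers above are project-specific. As a rough, self-contained sketch of the same idea (Brown clustering tools such as wcluster take one whitespace-tokenized document per line), assuming only a JSON file that maps document ids to raw text:

# Minimal sketch, not the project's code: a crude whitespace split stands in
# for tokenizer.split_into_words().
import codecs
import json

def write_brown_input(articles_json, output_txt):
    with codecs.open(articles_json, 'r', encoding='utf-8') as input_file:
        articles = json.load(input_file)
    with codecs.open(output_txt, 'w', encoding='utf-8') as output_file:
        for key in sorted(articles.keys()):
            tokens = articles[key].split()
            output_file.write(' '.join(tokens) + '\n')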
Example #2
def run_pipeline(skip_corenlp=True, corenlp_dir=None, overwrite=False, extension='.xml', nice=False):

    output_dir = fh.makedirs(dirs.data_stanford_dir)
    temp_dir = fh.makedirs(dirs.data_raw_sentences_dir)
    xml_dir = fh.makedirs(output_dir, 'xml')

    # now done by preprocessing tools
    """
    # part 1
    print "Splitting files"
    split_into_files(input_filename, temp_dir)
    """

    # part 2
    if not skip_corenlp:
        filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
        text_files = glob.glob(os.path.join(temp_dir, '*.txt'))
        text_files.sort()
        files = []
        if overwrite:
            print "Reprocessing all files"
            files = text_files
        else:
            for f in text_files:
                basename = os.path.basename(f)
                if not os.path.exists(os.path.join(xml_dir, basename + extension)):
                    files.append(f)
        print len(files), "files to process"

        with open(filelist_filename, 'w') as output_file:
            for f in files:
                output_file.write(f + '\n')

        if len(files) > 0:
            properties_file = os.path.join(os.getcwd(), 'core', 'external', 'CoreNLP.properties')
            print "Calling corenlp"
            call_corenlp(filelist_filename, xml_dir, corenlp_dir, properties_file, nice)

    # part 3
    print "Parsing xml"
    xml_filelist_filename = fh.make_filename(output_dir, 'xml_filelist', 'txt')

    files = glob.glob(os.path.join(xml_dir, '*.txt' + extension))
    with open(xml_filelist_filename, 'w') as output_file:
        for f in files:
            output_file.write(f + '\n')
    summary, dependencies = parse_xml_files(xml_filelist_filename, output_dir)

    # part 4
    print "Writing summary"
    #parsed_filename = fh.make_filename(output_dir, 'parsed', 'json')
    parse_summary_to_files(summary, dependencies, output_dir)
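
call_corenlp() is defined elsewhere in the project and is not shown. A minimal sketch of what such a call typically looks like for a stock Stanford CoreNLP distribution (the -props, -filelist, -outputFormat and -outputDirectory flags are standard CoreNLP options; the classpath layout and memory setting here are assumptions):

# Sketch only: not the project's call_corenlp() implementation.
import subprocess

def call_corenlp_sketch(filelist_filename, xml_dir, corenlp_dir, properties_file, nice=False):
    cmd = ['java', '-Xmx4g',
           '-cp', corenlp_dir + '/*',
           'edu.stanford.nlp.pipeline.StanfordCoreNLP',
           '-props', properties_file,
           '-filelist', filelist_filename,
           '-outputFormat', 'xml',
           '-outputDirectory', xml_dir]
    if nice:
        cmd = ['nice'] + cmd  # run at lower scheduling priority
    subprocess.call(cmd)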
Example #3
def main():

    usage = "%prog exp_dir_test_fold_dir"
    parser = OptionParser(usage=usage)

    parser.add_option('-t', dest='test_fold', default=0,
                      help='Test fold; default=%default')

    (options, args) = parser.parse_args()
    test_fold = options.test_fold
    exp_dir = args[0]

    results = pd.DataFrame(columns=('masked', 'test', 'valid', 'dir'))

    run_dirs = glob.glob(os.path.join(exp_dir, 'bayes*reuse*'))
    for i, dir in enumerate(run_dirs):
        run_num = int(fh.get_basename_wo_ext(dir).split('_')[-1])

        if run_num <= 40 and '1_' not in fh.get_basename_wo_ext(dir):
            results_dir = os.path.join(dir, 'results')
            test_file = fh.make_filename(results_dir, 'test_macro_f1', 'csv')
            valid_file = fh.make_filename(results_dir, 'valid_cv_macro_f1', 'csv')
            masked_valid_file = fh.make_filename(results_dir, 'masked_valid_cv_macro_f1', 'csv')

            try:
                test = pd.read_csv(test_file, header=0, index_col=0)
                valid = pd.read_csv(valid_file, header=0, index_col=0)
                masked_valid = pd.read_csv(masked_valid_file, header=0, index_col=0)

                #results.loc[run_num, 'iteration'] = run_num
                results.loc[i, 'masked'] = masked_valid['overall'].mean()
                results.loc[i, 'test'] = test['overall'].mean()
                results.loc[i, 'valid'] = valid['overall'].mean()
                results.loc[i, 'dir'] = fh.get_basename_wo_ext(dir)
            except Exception:
                continue

    results.to_csv(fh.make_filename(exp_dir, 'summary', 'csv'), columns=results.columns)

    sorted_results = results.sort_values('valid')
    print sorted_results

    print "best by masked"
    sorted_results = results.sort_values('masked')
    print sorted_results.values[-1, :]

    print "best by valid"
    sorted_results = results.sort_values('valid')
    print sorted_results.values[-1, :]
Example #4
def main():

    usage = "%prog exp_dir_test_fold_dir"
    parser = OptionParser(usage=usage)

    parser.add_option("-t", dest="test_fold", default=0, help="Test fold; default=%default")

    (options, args) = parser.parse_args()
    test_fold = options.test_fold
    exp_dir = args[0]

    results = pd.DataFrame(columns=("masked", "test", "valid", "dir"))

    run_dirs = glob.glob(os.path.join(exp_dir, "bayes*reuse*"))
    for i, dir in enumerate(run_dirs):
        run_num = int(fh.get_basename(dir).split("_")[-1])

        if run_num <= 40 and "1_" not in fh.get_basename(dir):
            results_dir = os.path.join(dir, "results")
            test_file = fh.make_filename(results_dir, "test_macro_f1", "csv")
            valid_file = fh.make_filename(results_dir, "valid_cv_macro_f1", "csv")
            masked_valid_file = fh.make_filename(results_dir, "masked_valid_cv_macro_f1", "csv")

            try:
                test = pd.read_csv(test_file, header=0, index_col=0)
                valid = pd.read_csv(valid_file, header=0, index_col=0)
                masked_valid = pd.read_csv(masked_valid_file, header=0, index_col=0)

                # results.loc[run_num, 'iteration'] = run_num
                results.loc[i, "masked"] = masked_valid["overall"].mean()
                results.loc[i, "test"] = test["overall"].mean()
                results.loc[i, "valid"] = valid["overall"].mean()
                results.loc[i, "dir"] = fh.get_basename(dir)
            except Exception:
                continue

    results.to_csv(fh.make_filename(exp_dir, "summary", "csv"), columns=results.columns)

    sorted_results = results.sort_values("valid")
    print sorted_results

    print "best by masked"
    sorted_results = results.sort_values("masked")
    print sorted_results.values[-1, :]

    print "best by valid"
    sorted_results = results.sort_values("valid")
    print sorted_results.values[-1, :]
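
The sorts above are mainly used to pull out the single best row; as a sketch, the same lookup can be done with idxmax instead of sorting (the numeric coercion is needed because the frame mixes float scores with the 'dir' string column):

import pandas as pd

def best_row(results, column):
    # index of the run with the highest score in the given column
    best_idx = pd.to_numeric(results[column]).idxmax()
    return results.loc[best_idx]

# e.g. best_row(results, 'masked') or best_row(results, 'valid')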
Example #5
def split_into_files(input_filename, output_dir):
    data = fh.read_json(input_filename)

    keys = data.keys()
    keys.sort()
    filelist = []

    for key in keys:
        key = key.rstrip('\n')
        line = data[key].rstrip('\n')
        normalized_filename = os.path.join(output_dir, key + '.txt')
        filelist.append(normalized_filename)
        with codecs.open(normalized_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(line)

    filelist_filename = fh.make_filename(output_dir, 'filelist', 'txt')
    fh.write_list_to_text(filelist, filelist_filename)
    return filelist_filename
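
fh.read_json, fh.make_filename and fh.write_list_to_text are project helpers; an equivalent sketch using only the standard library:

import codecs
import json
import os

def split_into_files_sketch(input_filename, output_dir):
    with codecs.open(input_filename, 'r', encoding='utf-8') as input_file:
        data = json.load(input_file)

    filelist = []
    for key in sorted(data.keys()):
        text = data[key].rstrip('\n')
        normalized_filename = os.path.join(output_dir, key.rstrip('\n') + '.txt')
        filelist.append(normalized_filename)
        with codecs.open(normalized_filename, 'w', encoding='utf-8') as output_file:
            output_file.write(text)

    filelist_filename = os.path.join(output_dir, 'filelist.txt')
    with codecs.open(filelist_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(filelist) + '\n')
    return filelist_filename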
Example #6
def preprocess_for_easysrl():

    input_filename = dirs.data_processed_text_file
    articles = fh.read_json(input_filename)
    keys = articles.keys()
    keys.sort()

    labeled = list(ds.get_all_documents())
    labeled.sort()

    processed_dict = {}
    output_filename = fh.make_filename(dirs.data_easysrl_dir, 'input', 'txt')

    with codecs.open(output_filename, 'w', encoding='utf-8') as output_file:
        count = 0
        for k in labeled:
            output_file.write(k + ' starts here\n')
            text = articles[k]
            paragraphs = text.split('\n\n')
            for p in paragraphs:
                sentences = tokenizer.split_sentences(p.strip())
                for s in sentences:
                    output_file.write(s.strip() + '\n')
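
tokenizer.split_sentences() is project code that is not shown here. If it were unavailable, NLTK's punkt splitter is one common stand-in (this assumes nltk and its 'punkt' data are installed; it is not what the project uses):

from nltk.tokenize import sent_tokenize

def split_paragraph_sentences(text):
    # paragraphs are separated by blank lines, as in the example above
    sentences = []
    for paragraph in text.split('\n\n'):
        sentences.extend(sent_tokenize(paragraph.strip()))
    return sentences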
Example #7
def main():
    exp_dir = defines.exp_dir
    exp_name = 'bayes_opt_LR_alphas_reuse'
    df = pd.DataFrame()

    basenames = ['test_acc.csv', 'test_micro_f1.csv', 'test_macro_f1.csv', 'test_pp.csv']
    rownames = ['model accuracy', 'model micro f1', 'model macro f1', 'model percent perfect']

    for i, basename in enumerate(basenames):
        rowname = rownames[i]
        files = glob.glob(os.path.join(exp_dir, '*', 'test_fold_0', exp_name, 'results', basename))
        gather_results(df, files, rowname)

    files = glob.glob(os.path.join(defines.data_raw_labels_dir, '*.csv'))
    for file in files:
        dataset = fh.get_basename(file)
        codes = labels.get_dataset_labels(dataset)
        if dataset in df.columns:
            df.loc['Number of responses', dataset] = codes.shape[0]
            df.loc['Number of labels', dataset] = codes.shape[1]

    output_dir = '/Users/dcard/Dropbox/CMU/DAP/results/'
    output_filename = fh.make_filename(output_dir, exp_name, 'csv')
    df.to_csv(output_filename)
Example #8
def parse_xml_files(xml_filelist_filename, output_dir):
    filelist = fh.read_text(xml_filelist_filename)
    parsed_files = {}
    sentiments = {}
    dependencies = {}
    dependency_tuples = {}
    entities = {}
    coref = {}
    coref_entities = {}
    coref_heads = {}
    all_groups = {}
    jk_grams = {}
    amalgram_pairs = {}
    for file in filelist:
        file = file.rstrip('\n')
        print file
        # peel off both .txt and .xml
        basename = fh.get_basename_wo_ext(fh.get_basename_wo_ext(file))
        sentences, doc_sentiments, doc_dependencies, doc_dependency_tuples, doc_entities, doc_coref, groups, _,\
            doc_coref_entities, doc_coref_heads = parse_xml_output(file)
        parsed_files[basename] = sentences
        sentiments[basename] = doc_sentiments
        dependencies[basename] = doc_dependencies
        dependency_tuples[basename] = doc_dependency_tuples
        entities[basename] = doc_entities
        coref[basename] = doc_coref
        coref_entities[basename] = doc_coref_entities
        coref_heads[basename] = doc_coref_heads

        doc_jk_grams, doc_jk_indices = find_jk_grams(sentences)
        jk_grams[basename] = doc_jk_grams

        # output documents to amalgram format
        #amalgram_dir = os.path.join(dirs.data_amalgram_dir, 'input')
        #if not os.path.exists(amalgram_dir):
        #    os.makedirs(amalgram_dir)
        # save word/tag pairs for amalgram
        tagged_sents = [[(t['word'], t['POS']) for t in s] for s in sentences]
        amalgram_pairs[basename] = tagged_sents

        # write parsed sentences to per-document json files (used for extracting story elements)
        parsed_dir = os.path.join(output_dir, 'parsed')
        if not os.path.exists(parsed_dir):
            os.makedirs(parsed_dir)
        parsed_filename = os.path.join(parsed_dir, basename + '.json')
        fh.write_to_json(sentences, parsed_filename, sort_keys=False)

    sentiment_filename = fh.make_filename(output_dir, 'sentiments', 'json')
    fh.write_to_json(sentiments, sentiment_filename, sort_keys=False)

    dependencies_filename = fh.make_filename(output_dir, 'dependency_tuple_ids', 'json')
    fh.write_to_json(dependency_tuples, dependencies_filename, sort_keys=False)

    coref_filename = fh.make_filename(output_dir, 'entities', 'json')
    fh.write_to_json(coref, coref_filename, sort_keys=False)

    jkgrams_filename = fh.make_filename(output_dir, 'jkgrams', 'json')
    fh.write_to_json(jk_grams, jkgrams_filename, sort_keys=False)

    coref_heads_filename = fh.make_filename(output_dir, 'coref_heads', 'json')
    fh.write_to_json(coref_heads, coref_heads_filename, sort_keys=False)

    amalgram_keys = amalgram_pairs.keys()
    amalgram_keys.sort()
    amalgram_data_file = os.path.join(dirs.data_amalgram_dir, 'input.txt')
    with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    for k in amalgram_keys:
        amalgram_data_file = os.path.join(dirs.data_amalgram_dir, k + '.txt')
        with codecs.open(amalgram_data_file, 'w', encoding='utf-8') as output_file:
            sents = amalgram_pairs[k]
            for s in sents:
                for p in s:
                    output_file.write(p[0] + '\t' + p[1] + '\n')
                output_file.write('\n')

    amalgram_index_file = os.path.join(dirs.data_amalgram_dir, 'index.txt')
    with codecs.open(amalgram_index_file, 'w', encoding='utf-8') as output_file:
        for k in amalgram_keys:
            sents = amalgram_pairs[k]
            for s in sents:
                output_file.write(k + '\n')

    #all_groups_filename = fh.make_filename(output_dir, 'all_groups', 'json')
    #fh.write_to_json(all_groups, all_groups_filename)

    return parsed_files, dependencies
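
parse_xml_output() is defined elsewhere; as a rough sketch, the token fields used above (word, lemma, POS, NER) can be pulled out of CoreNLP's XML output with the standard library, assuming the usual sentences/sentence/tokens/token layout:

import xml.etree.ElementTree as ET

def read_corenlp_tokens(xml_filename):
    root = ET.parse(xml_filename).getroot()
    sentences = []
    for sentence in root.iter('sentence'):
        tokens = [{'word': t.findtext('word'),
                   'lemma': t.findtext('lemma'),
                   'POS': t.findtext('POS'),
                   'NER': t.findtext('NER')}
                  for t in sentence.iter('token')]
        if tokens:  # skip <sentence> references inside the coreference section
            sentences.append(tokens)
    return sentences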
Example #9
def parse_summary_to_files(parsed, dependencies, output_dir):

    words = {}
    lemmas = {}
    pos = {}
    ner = {}
    word_pos = {}
    lemma_pos = {}
    word_ner = {}
    lemma_ner = {}
    dependency_links = {}
    dependency_heads = {}
    dependency_tails = {}
    dependency_tuples = {}
    dependency_pairs = {}

    dicts = [words, lemmas, pos, ner, word_pos, lemma_pos, word_ner, lemma_ner]
    dicts2 = [dependency_links, dependency_heads, dependency_tails, dependency_tuples, dependency_pairs]

    last_ner_tag = None
    for key in parsed.keys():
        # TODO: Actually want [a*]n+[pn+]*
        quote = None
        for d in dicts:
            d[key] = []
        sentences = parsed[key]
        for s in sentences:
            for d in dicts:
                d[key].append([])
            for token in s:
                words[key][-1].append(token['word'])
                lemmas[key][-1].append(token['lemma'])
                pos[key][-1].append(token['POS'])
                word_pos[key][-1].append(token['word'] + '_' + token['POS'])
                lemma_pos[key][-1].append(token['lemma'] + '_' + token['POS'])
                if token['NER'] != 'O':
                    # if this tag matches the previous token's tag, merge into the previous entry
                    if token['NER'] == last_ner_tag and len(word_ner[key][-1]) > 0:
                        word_ner[key][-1][-1] = '_'.join(word_ner[key][-1][-1].split('_')[:-1] + [token['word'], token['NER']])
                        lemma_ner[key][-1][-1] = '_'.join(lemma_ner[key][-1][-1].split('_')[:-1] + [token['lemma'], token['NER']])
                        #word_ner[key][-1].append(token['word'] + '_' + token['NER'])
                        #lemma_ner[key][-1].append(token['lemma'] + '_' + token['NER'])
                    else:
                        ner[key][-1].append(token['NER'])
                        word_ner[key][-1].append(token['word'] + '_' + token['NER'])
                        lemma_ner[key][-1].append(token['lemma'] + '_' + token['NER'])
                last_ner_tag = token['NER']

        # join the word and lemma lists into documents
        words[key] = '\n'.join([' '.join(sentence_tokens) for sentence_tokens in words[key]])
        lemmas[key] = '\n'.join([' '.join(sentence_tokens) for sentence_tokens in lemmas[key]])


    words_filename = fh.make_filename(output_dir, 'words', 'json')
    fh.write_to_json(words, words_filename, sort_keys=False)

    lemmas_filename = fh.make_filename(output_dir, 'lemmas', 'json')
    fh.write_to_json(lemmas, lemmas_filename, sort_keys=False)

    pos_filename = fh.make_filename(output_dir, 'pos', 'json')
    fh.write_to_json(pos, pos_filename, sort_keys=False)

    ner_filename = fh.make_filename(output_dir, 'ner', 'json')
    fh.write_to_json(ner, ner_filename, sort_keys=False)

    word_pos_filename = fh.make_filename(output_dir, 'word_pos', 'json')
    fh.write_to_json(word_pos, word_pos_filename, sort_keys=False)

    lemma_pos_filename = fh.make_filename(output_dir, 'lemma_pos', 'json')
    fh.write_to_json(lemma_pos, lemma_pos_filename, sort_keys=False)

    word_ner_filename = fh.make_filename(output_dir, 'word_ner', 'json')
    fh.write_to_json(word_ner, word_ner_filename, sort_keys=False)

    lemma_ner_filename = fh.make_filename(output_dir, 'lemma_ner', 'json')
    fh.write_to_json(lemma_ner, lemma_ner_filename, sort_keys=False)

    for key in dependencies.keys():
        for d in dicts2:
            d[key] = []
        sentences = dependencies[key]
        for s in sentences:
            for d in dicts2:
                d[key].append([])
            for tuple in s:
                dependency_links[key][-1].append(tuple[1])
                dependency_heads[key][-1].append(tuple[0] + '_' + tuple[1])
                dependency_tails[key][-1].append(tuple[1] + '_' + tuple[2])
                dependency_tuples[key][-1].append(tuple[0] + '_' + tuple[1] + '_' + tuple[2])
                dependency_pairs[key][-1].append(tuple[0] + '_' + tuple[2])

    dep_filename = fh.make_filename(output_dir, 'dependency_links', 'json')
    fh.write_to_json(dependency_links, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_heads', 'json')
    fh.write_to_json(dependency_heads, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_tails', 'json')
    fh.write_to_json(dependency_tails, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_tuples', 'json')
    fh.write_to_json(dependency_tuples, dep_filename, sort_keys=False)

    dep_filename = fh.make_filename(output_dir, 'dependency_pairs', 'json')
    fh.write_to_json(dependency_pairs, dep_filename, sort_keys=False)
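
The NER handling above collapses consecutive tokens that share a tag into a single word_TAG entry; the same logic as a small standalone function (a sketch, assuming token dicts with 'word' and 'NER' keys as in the parsed output):

def merge_ner_spans(sentence_tokens):
    merged = []
    last_tag = None
    for token in sentence_tokens:
        if token['NER'] != 'O':
            if token['NER'] == last_tag and merged:
                # extend the previous span: e.g. 'New_LOCATION' -> 'New_York_LOCATION'
                parts = merged[-1].split('_')[:-1]
                merged[-1] = '_'.join(parts + [token['word'], token['NER']])
            else:
                merged.append(token['word'] + '_' + token['NER'])
        last_tag = token['NER']
    return merged

# e.g. merge_ner_spans([{'word': 'New', 'NER': 'LOCATION'},
#                       {'word': 'York', 'NER': 'LOCATION'}]) == ['New_York_LOCATION']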