Example #1
def plot_type_evaluation():
    """
    Plot results from the :func:`evaluate_dep_types` experiment.
    """
    l = [
        'dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp', 'attr', 'ccomp',
        'xcomp', 'complm', 'dobj', 'iobj', 'pobj', 'mark', 'rel', 'nsubj',
        'nsubjpass', 'csubj', 'csubjpass', 'cc', 'conj', 'expl', 'abbrev',
        'amod', 'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj',
        'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod', 'quantmod',
        'tmod', 'nn', 'npadvmod', 'num', 'number', 'prep', 'poss',
        'possessive', 'prt', 'parataxis', 'punct', 'ref', 'xsubj', 'pcomp',
        'prepc'
    ]
    d = data.pickle_from_file('output/dependencies/types_eval_class')
    diffs = []
    print '--- Classification ---'
    for i, dep_type in enumerate(l):
        val = d['classification'][i]
        diff = val - 0.5750
        diffs.append(diff)
        print "\t\t\t" + dep_type + "  &  " + '%1.4f' % val + "  &  " + '%1.4f' % diff + "\\\\"

    d = data.pickle_from_file('output/dependencies/types_eval_retr')
    diffs = []
    print '--- Retrieval ---'
    for i, dep_type in enumerate(l):
        val = d['retrieval'][i]
        diff = val - 0.1985
        diffs.append(diff)
        print "\t\t\t" + dep_type + "  &  " + '%1.4f' % val + "  &  " + '%1.4f' % diff + "\\\\"
Example #2
def plot_type_evaluation():
    """
    Plot results from the :func:`evaluate_dep_types` experiment.
    """
    l = ['dep', 'aux', 'auxpass', 'cop', 'agent', 'acomp',
        'attr', 'ccomp', 'xcomp', 'complm', 'dobj', 'iobj',
        'pobj', 'mark', 'rel', 'nsubj', 'nsubjpass', 'csubj',
        'csubjpass', 'cc', 'conj', 'expl', 'abbrev', 'amod',
        'appos', 'advcl', 'purpcl', 'det', 'predet', 'preconj',
        'infmod', 'mwe', 'partmod', 'advmod', 'neg', 'rcmod',
        'quantmod', 'tmod', 'nn', 'npadvmod', 'num', 'number',
        'prep', 'poss', 'possessive', 'prt', 'parataxis',
        'punct', 'ref', 'xsubj', 'pcomp', 'prepc']
    d = data.pickle_from_file('output/dependencies/types_eval_class')
    diffs  = []
    print '--- Classification ---'
    for i, dep_type in enumerate(l):
        val = d['classification'][i]
        diff = val - 0.5750
        diffs.append(diff)
        print "\t\t\t"+dep_type+"  &  "+'%1.4f'%val+"  &  "+'%1.4f'%diff +"\\\\"

    d = data.pickle_from_file('output/dependencies/types_eval_retr')
    diffs  = []
    print '--- Retrieval ---'
    for i, dep_type in enumerate(l):
        val = d['retrieval'][i]
        diff = val - 0.1985
        diffs.append(diff)
        print "\t\t\t"+dep_type+"  &  "+'%1.4f'%val+"  &  "+'%1.4f'%diff +"\\\\"
Example #3
def print_hubs():
    """
    Print results from :func:`print_common_hub_words` as a LaTeX table.
    """
    w = data.pickle_from_file('output/dependencies/common_hubs_withstop_words')
    wo = data.pickle_from_file('output/dependencies/common_hubs_withoutstop_words')

    tasa_w = [term.encode('ascii','ignore') for term in w['tasa'][:10]]
    air_w = [term.encode('ascii','ignore') for term in w['air'][:10]]
    tasa_wo = [term.encode('ascii','ignore') for term in wo['tasa'][:10]]
    air_wo = [term.encode('ascii','ignore') for term in wo['air'][:10]]

    for i in range(10):
        print tasa_w[i],' & ',air_w[i],' & ',tasa_wo[i],' & ',air_wo[i],'\\\\'
Example #4
def plot_results():
    retr_results = data.pickle_from_file('output/retr_context_10')
    retr_results = {
        'Degree (window)': [
            0.22290305491606582, 0.2239404496699994, 0.22351183191703122,
            0.22293583927185456, 0.2216027852882311, 0.22232860216650002,
            0.22230162622918934, 0.22287683186704185, 0.22266252053221772,
            0.22237418794670616
        ],
        'PageRank (window)': [
            0.21772129149181993, 0.21884861149427587, 0.22063142971295358,
            0.21893898241891538, 0.21973766615441442, 0.22054672890564322,
            0.22099589130745473, 0.22129686184085004, 0.22148942934157456,
            0.22147928890310792
        ],
        'PageRank (sentence)': [0.22056586008664569] * 10,
        'Degree (sentence)': [0.21784622825075944] * 10
    }
    #~ #'PageRank (sentence)':[0.223649757653]*10,
    #~ #'Weighted degree (sentence)':[0.223449136101]*10}
    pp.pprint(retr_results)
    plotter.plot(range(1, 11),
                 retr_results,
                 'retrieval score',
                 'n, context size',
                 '', [1, 10, .216, .225],
                 legend_place="lower right")
Example #5
def test_co_occurrences():
    doc1 = data.read_file('../data/tasa/TASATest/Science/Agatha09.07.03.txt')
    doc2 = data.read_file('../data/tasa/TASATest_preprocessed/Science/Agatha09.07.03.txt')
    g0 = construct_cooccurrence_network(doc1, context='window', already_preprocessed=False)
    g1 = construct_cooccurrence_network(doc2, context='window', already_preprocessed=True)
    g2 = construct_cooccurrence_network(doc1, context='sentence', already_preprocessed=False)
    graphs = data.pickle_from_file('output/testdata/co-occurrence-graphs.pkl')
    assert(graph.equal(g0,graphs[0]))
    assert(graph.equal(g1,graphs[1]))
    assert(graph.equal(g2,graphs[2]))

    doc = data.read_file('output/testdata/higher.order.testdoc.preprocessed.txt')
    g1 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1])
    g12 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,2])
    g123 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,2,3])
    g13 = construct_cooccurrence_network(doc, already_preprocessed=True, window_size=1, orders=[1,3])
    assert(('foo','bar') in g1.edges())
    assert(('foo','baz') not in g1.edges())
    assert(('foo','cake') not in g1.edges())
    assert(('foo','bar') in g12.edges())
    assert(('foo','baz') in g12.edges())
    assert(('foo','cake') not in g12.edges())
    assert(('foo','bar') in g123.edges())
    assert(('foo','baz') in g123.edges())
    assert(('foo','cake') in g123.edges())
    assert(('foo','baz') not in g13.edges())
    print 'ok'
Example #6
def print_hubs():
    """
    Print results from :func:`print_common_hub_words` as a LaTeX table.
    """
    w = data.pickle_from_file('output/dependencies/common_hubs_withstop_words')
    wo = data.pickle_from_file(
        'output/dependencies/common_hubs_withoutstop_words')

    tasa_w = [term.encode('ascii', 'ignore') for term in w['tasa'][:10]]
    air_w = [term.encode('ascii', 'ignore') for term in w['air'][:10]]
    tasa_wo = [term.encode('ascii', 'ignore') for term in wo['tasa'][:10]]
    air_wo = [term.encode('ascii', 'ignore') for term in wo['air'][:10]]

    for i in range(10):
        print tasa_w[i], ' & ', air_w[i], ' & ', tasa_wo[i], ' & ', air_wo[
            i], '\\\\'
Example #7
def plot_results():
    retr_results = data.pickle_from_file('output/retr_context_10')
    retr_results = {'Degree (window)': [0.22290305491606582,
                       0.2239404496699994,
                       0.22351183191703122,
                       0.22293583927185456,
                       0.2216027852882311,
                       0.22232860216650002,
                       0.22230162622918934,
                       0.22287683186704185,
                       0.22266252053221772,
                       0.22237418794670616],
                 'PageRank (window)': [0.21772129149181993,
                              0.21884861149427587,
                              0.22063142971295358,
                              0.21893898241891538,
                              0.21973766615441442,
                              0.22054672890564322,
                              0.22099589130745473,
                              0.22129686184085004,
                              0.22148942934157456,
                              0.22147928890310792],
                    'PageRank (sentence)': [0.22056586008664569]*10,
                    'Degree (sentence)': [0.21784622825075944]*10}
                    #~ #'PageRank (sentence)':[0.223649757653]*10,
                    #~ #'Weighted degree (sentence)':[0.223449136101]*10}
    pp.pprint(retr_results)
    plotter.plot(range(1,11),retr_results,'retrieval score','n, context size','',[1,10,.216,.225], legend_place="lower right")
Example #8
def retrieval_comparison_graph(dataset='air', graph_type='co-occurrence', use_icc=False):
    """
    Experiment used for comparative evaluation of different network
    representations on retrieval.

    graph_type = 'co-occurrence' | 'dependency'

    `use_icc` determines whether to use _inverse corpus centrality_ (TC-ICC) in the vector representations.
    """
    def make_dicts(docs, icc=None):
        rep = []
        for i, doc in enumerate(docs):
            if i%100==0: print '    graph',str(i)+'/'+str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence':'_text', 'dependency':'_dependencies'}
    gfuns = {'co-occurrence':graph_representation.construct_cooccurrence_network,
                'dependency':graph_representation.construct_dependency_network}
    metrics = {'co-occurrence':graph.GraphMetrics.WEIGHTED_DEGREE,
                'dependency':graph.GraphMetrics.EIGENVECTOR}

    print '--', graph_type
    print '> Reading data..', dataset
    path = '../data/'+dataset+'/problem_descriptions'+postfix[graph_type]
    docs, labels = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/'+dataset+'/solutions_preprocessed'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    icc = None
    if use_icc:
        print '> Calculating ICC..'
        m = metrics[graph_type].split()[0]
        print graph_type
        if graph_type == 'co-occurrence':
            p = 'output/centralities/co-occurrence/'+dataset+'/problem_descriptions/window/'+m+'.cent'
        elif graph_type == 'dependency':
            p = 'output/centralities/dependency/'+dataset+'/problem_descriptions/'+m+'.cent'
        print '    fetching', p
        icc = data.pickle_from_file(p)
        print '    icc:', type(icc)

    print '> Creating problem description representations..'
    dicts = make_dicts(docs, icc)
    descriptions_rep = graph_representation.dicts_to_vectors(dicts)#, remove_stop_words=True)

    print '> Evaluating..'
    results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
    print results
    s = 'retrieval comparison '
    if use_icc: s += 'USING TC-ICC'
    s += '\nrepresentation: '+graph_type+'\nresult: '+str(results)+'\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
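A minimal usage sketch for the function above, comparing both network representations on one dataset (illustrative only: it assumes this module and its ../data and output/ layout are in place; the loop itself is not part of the original code):

# Hypothetical driver: evaluate both representations on 'air' without TC-ICC.
for gt in ['co-occurrence', 'dependency']:
    res = retrieval_comparison_graph(dataset='air', graph_type=gt, use_icc=False)
    print gt, ':', res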
Example #9
def compare_stats_to_random(dataset):
    dataset = dataset.replace('/','.')
    stats = data.pickle_from_file('output/properties/cooccurrence/stats_tot_'+dataset)
    n = stats['# nodes_mean']
    p = stats['mean degree_mean']/(2*n)
    g = nx.directed_gnp_random_graph(int(n), p)
    props = graph.network_properties(g)
    pp.pprint(props)
Example #10
def compare_stats_to_random(dataset):
    dataset = dataset.replace('/', '.')
    stats = data.pickle_from_file('output/properties/cooccurrence/stats_tot_' +
                                  dataset)
    n = stats['# nodes_mean']
    p = stats['mean degree_mean'] / (2 * n)
    g = nx.directed_gnp_random_graph(int(n), p)
    props = graph.network_properties(g)
    pp.pprint(props)
Example #11
def construct_cooccurrence_network(doc,
                                   window_size=2,
                                   direction='undirected',
                                   context='sentence',
                                   already_preprocessed=False,
                                   orders=[],
                                   order_weights=[1.0, 1.0, 1.0],
                                   doc_id=None,
                                   verbose=False):
    """Construct co-occurrence network from text.

    *direction* must be 'forward', 'backward' or 'undirected', while  *context*
    can be 'window' or 'sentence'.

    If *context* is 'window', *already_preprocessed* indicates whether *doc*
    has already been preprocessed. Sentence contexts require unpreprocessed *doc*s.

    Any value for *window_size* is ignored if *context* is 'sentence'.

    A DiGraph is created regardless of the *direction* parameter, but with
    'undirected', edges are created in both directions.
    """
    doc = _cooccurrence_preprocessing(doc, context, already_preprocessed)
    if context == 'sentence':
        matrix, term_list = _sentence_cooccurrence_matrix(
            doc, direction, verbose)
    elif context == 'window':
        matrix, term_list = _window_cooccurrence_matrix(
            doc, direction, window_size, verbose)
    g = nx.DiGraph()
    g.add_nodes_from(term_list)
    if len(orders) == 0:
        graph.add_edges_from_matrix(g, matrix, term_list)
    else:
        if doc_id is not None and os.path.exists(doc_id):
            first, second, third = data.pickle_from_file(doc_id)
        else:
            first, second, third = _higher_order_matrix(matrix.todense())
            if doc_id is not None:
                data.pickle_to_file((first, second, third), doc_id)
    if 1 in orders:
        graph.add_edges_from_matrix(g,
                                    first,
                                    term_list,
                                    rel_weight=order_weights[0])
    if 2 in orders:
        graph.add_edges_from_matrix(g,
                                    second,
                                    term_list,
                                    rel_weight=order_weights[1])
    if 3 in orders:
        graph.add_edges_from_matrix(g,
                                    third,
                                    term_list,
                                    rel_weight=order_weights[2])
    return g
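A brief usage sketch for the co-occurrence constructor above (illustrative, assuming the surrounding module is importable and `text` holds a raw document string):

# Sentence contexts expect raw text; window contexts also accept preprocessed text.
g_sent = construct_cooccurrence_network(text, context='sentence')
g_win = construct_cooccurrence_network(text, context='window', window_size=2)
print len(g_sent), len(g_win)  # number of term nodes in each DiGraph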
Example #12
def store_corpus_network(corpus, context):
    print '> Constructing corpus network for', corpus
    path = '../data/'+corpus+'_text'
    store_path = 'output/giants/co-occurrence/'+corpus+'/'+context+'_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(gdoc, context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
Example #13
def store_corpus_network(corpus, context):
    print '> Constructing corpus network for', corpus
    path = '../data/' + corpus + '_text'
    store_path = 'output/giants/co-occurrence/' + corpus + '/' + context + '_graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdoc = ' '.join(texts)
    giant = graph_representation.construct_cooccurrence_network(
        gdoc, context=context, already_preprocessed=False, verbose=True)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
Example #14
def print_network_props():
    """
    Prints a LaTeX table with various properties for networks created from
    texts in the datasets.
    """
    print '-- Co-occurrence'
    tasa = data.pickle_from_file('output/properties/cooccurrence/stats_tot_tasa.TASA900')
    air = data.pickle_from_file('output/properties/cooccurrence/stats_tot_air.problem_descriptions')
    for key in air.keys():
        prop, sep, mod = key.partition('_')
        if mod!='std':
            print prop,' & ',
            print '%2.3f'%tasa[prop+sep+'mean'],' & ','%2.3f'%tasa[prop+sep+'std'],' & ',
            print '%2.3f'%air[prop+sep+'mean'],' & ','%2.3f'%air[prop+sep+'std'],'\\\\'
    print
    print '-- Dependency, all types'
    air = data.pickle_from_file('output/properties/dependency/stats_tot_air.problem_descriptions')
    tasa = data.pickle_from_file('output/properties/dependency/stats_tot_tasa.TASA900')
    for key in air.keys():
        prop, sep, mod = key.partition('_')
        if mod!='std':
            print prop,' & ',
            print '%2.3f'%tasa[prop+sep+'mean'],' & ','%2.3f'%tasa[prop+sep+'std'],' & ',
            print '%2.3f'%air[prop+sep+'mean'],' & ','%2.3f'%air[prop+sep+'std'],'\\\\'
Example #15
def print_network_props():
    """
    Prints a LaTeX table with various properties for networks created from
    texts in the datasets.
    """
    print '-- Co-occurrence'
    tasa = data.pickle_from_file(
        'output/properties/cooccurrence/stats_tot_tasa.TASA900')
    air = data.pickle_from_file(
        'output/properties/cooccurrence/stats_tot_air.problem_descriptions')
    for key in air.keys():
        prop, sep, mod = key.partition('_')
        if mod != 'std':
            print prop, ' & ',
            print '%2.3f' % tasa[prop + sep +
                                 'mean'], ' & ', '%2.3f' % tasa[prop + sep +
                                                                'std'], ' & ',
            print '%2.3f' % air[prop + sep +
                                'mean'], ' & ', '%2.3f' % air[prop + sep +
                                                              'std'], '\\\\'
    print
    print '-- Dependency, all types'
    air = data.pickle_from_file(
        'output/properties/dependency/stats_tot_air.problem_descriptions')
    tasa = data.pickle_from_file(
        'output/properties/dependency/stats_tot_tasa.TASA900')
    for key in air.keys():
        prop, sep, mod = key.partition('_')
        if mod != 'std':
            print prop, ' & ',
            print '%2.3f' % tasa[prop + sep +
                                 'mean'], ' & ', '%2.3f' % tasa[prop + sep +
                                                                'std'], ' & ',
            print '%2.3f' % air[prop + sep +
                                'mean'], ' & ', '%2.3f' % air[prop + sep +
                                                              'std'], '\\\\'
Example #16
def test_co_occurrences():
    doc1 = data.read_file('../data/tasa/TASATest/Science/Agatha09.07.03.txt')
    doc2 = data.read_file(
        '../data/tasa/TASATest_preprocessed/Science/Agatha09.07.03.txt')
    g0 = construct_cooccurrence_network(doc1,
                                        context='window',
                                        already_preprocessed=False)
    g1 = construct_cooccurrence_network(doc2,
                                        context='window',
                                        already_preprocessed=True)
    g2 = construct_cooccurrence_network(doc1,
                                        context='sentence',
                                        already_preprocessed=False)
    graphs = data.pickle_from_file('output/testdata/co-occurrence-graphs.pkl')
    assert (graph.equal(g0, graphs[0]))
    assert (graph.equal(g1, graphs[1]))
    assert (graph.equal(g2, graphs[2]))

    doc = data.read_file(
        'output/testdata/higher.order.testdoc.preprocessed.txt')
    g1 = construct_cooccurrence_network(doc,
                                        already_preprocessed=True,
                                        window_size=1,
                                        orders=[1])
    g12 = construct_cooccurrence_network(doc,
                                         already_preprocessed=True,
                                         window_size=1,
                                         orders=[1, 2])
    g123 = construct_cooccurrence_network(doc,
                                          already_preprocessed=True,
                                          window_size=1,
                                          orders=[1, 2, 3])
    g13 = construct_cooccurrence_network(doc,
                                         already_preprocessed=True,
                                         window_size=1,
                                         orders=[1, 3])
    assert (('foo', 'bar') in g1.edges())
    assert (('foo', 'baz') not in g1.edges())
    assert (('foo', 'cake') not in g1.edges())
    assert (('foo', 'bar') in g12.edges())
    assert (('foo', 'baz') in g12.edges())
    assert (('foo', 'cake') not in g12.edges())
    assert (('foo', 'bar') in g123.edges())
    assert (('foo', 'baz') in g123.edges())
    assert (('foo', 'cake') in g123.edges())
    assert (('foo', 'baz') not in g13.edges())
    print 'ok'
Example #17
def store_corpus_network(corpus):
    print '> Constructing corpus network for', corpus
    path = '../data/'+corpus+'_dependencies'
    store_path = 'output/giants/dependency/'+corpus+'/graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdeps = {}
    for i, text in enumerate(texts):
        if i%1==0: print '   ',str(i)+'/'+str(len(texts))
        d = pickle.loads(text)
        for dep in d.keys():
            gdeps[dep] = gdeps.get(dep, []) + d[dep]
    giant = graph_representation.construct_dependency_network(gdeps,verbose=True,unpickle=False)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
Example #18
def store_corpus_network(corpus):
    print '> Constructing corpus network for', corpus
    path = '../data/' + corpus + '_dependencies'
    store_path = 'output/giants/dependency/' + corpus + '/graph.net'
    if data.pickle_from_file(store_path, suppress_warning=True):
        print '    already present, skipping'
        return
    texts, labels = data.read_files(path)
    gdeps = {}
    for i, text in enumerate(texts):
        if i % 1 == 0: print '   ', str(i) + '/' + str(len(texts))
        d = pickle.loads(text)
        for dep in d.keys():
            gdeps[dep] = gdeps.get(dep, []) + d[dep]
    giant = graph_representation.construct_dependency_network(gdeps,
                                                              verbose=True,
                                                              unpickle=False)
    print '> Serializing and saving..'
    data.pickle_to_file(giant, store_path)
Example #19
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/'+corpus+'/'+context+'/'+m+'.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file('MemoryError while calculating '+metric+' on '+corpus+':\n'+str(e)+'\n\n', 'output/log/errors')
Example #20
def plot_sentence_lengths(datafile=None):
    """
    Function for plotting a histogram of sentence lengths within a given dataset.
    """
    if datafile is None:
        import preprocess
        print '> reading data..'
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print '> counting lengths..'
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)
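Both call modes of the function above, sketched (the pickle path is the one the first call writes; data layout as in the other examples):

plot_sentence_lengths()  # first run: counts lengths from ../data/tasa/TASA900_text and pickles them
plot_sentence_lengths('output/tasa_sentence_lengths.pkl')  # later runs: reuse the stored lengths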
Example #21
def store_centralities(corpus, context):
    print '> Calculating and storing centralities for', corpus
    g = retrieve_corpus_network(corpus, context)
    metrics = graph_representation.get_metrics(True, exclude_flow=True)

    for metric in metrics:
        m = metric.split()[0]
        store_path = 'output/centralities/co-occurrence/' + corpus + '/' + context + '/' + m + '.cent'
        if data.pickle_from_file(store_path, suppress_warning=True):
            print '    already present, skipping:', metric
            continue
        else:
            print '    calculating:', metric
        try:
            c = graph.centralities(g, metric)
            data.pickle_to_file(c, store_path)
        except MemoryError as e:
            print 'MemoryError :('
            data.write_to_file(
                'MemoryError while calculating ' + metric + ' on ' + corpus +
                ':\n' + str(e) + '\n\n', 'output/log/errors')
Example #22
def construct_cooccurrence_network(doc, window_size=2, direction='undirected', context='sentence', already_preprocessed=False, orders=[], order_weights=[1.0,1.0,1.0],doc_id=None,verbose=False):
    """Construct co-occurrence network from text.

    *direction* must be 'forward', 'backward' or 'undirected', while  *context*
    can be 'window' or 'sentence'.

    If *context* is 'window', *already_preprocessed* indicates whether *doc*
    has already been preprocessed. Sentence contexts require unpreprocessed *doc*s.

    Any value for *window_size* is ignored if *context* is 'sentence'.

    A DiGraph is created regardless of the *direction* parameter, but with
    'undirected', edges are created in both directions.
    """
    doc = _cooccurrence_preprocessing(doc, context, already_preprocessed)
    if context == 'sentence':
        matrix, term_list = _sentence_cooccurrence_matrix(doc, direction, verbose)
    elif context == 'window':
        matrix, term_list = _window_cooccurrence_matrix(doc, direction, window_size, verbose)
    g = nx.DiGraph()
    g.add_nodes_from(term_list)
    if len(orders)==0:
        graph.add_edges_from_matrix(g, matrix, term_list)
    else:
        if doc_id is not None and os.path.exists(doc_id):
            first, second, third = data.pickle_from_file(doc_id)
        else:
            first, second, third = _higher_order_matrix(matrix.todense())
            if doc_id is not None:
                data.pickle_to_file((first,second,third), doc_id)
    if 1 in orders:
        graph.add_edges_from_matrix(g, first, term_list, rel_weight=order_weights[0])
    if 2 in orders:
        graph.add_edges_from_matrix(g, second, term_list, rel_weight=order_weights[1])
    if 3 in orders:
        graph.add_edges_from_matrix(g, third, term_list, rel_weight=order_weights[2])
    return g
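The *orders*/*order_weights* parameters documented above add higher-order co-occurrence edges; a hedged sketch with hypothetical weights and cache path (assuming `doc` is already preprocessed):

# First- through third-order edges, down-weighting the higher orders (weights are illustrative).
g = construct_cooccurrence_network(doc, window_size=1, already_preprocessed=True,
                                   orders=[1, 2, 3], order_weights=[1.0, 0.5, 0.25],
                                   doc_id='output/testdata/example.orders.cache')
print len(g.edges())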
Example #23
def plot_sentence_lengths(datafile=None):
    """
    Function for plotting a histogram of sentence lengths within a given dataset.
    """
    if datafile is None:
        import preprocess
        print '> reading data..'
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print '> counting lengths..'
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths,
                            'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths,
                      'sentence length (tokens)',
                      '# sentences',
                      bins=70)
Example #24
def retrieve_centralities(corpus, context, metric):
    m = metric.split()[0]
    path = 'output/centralities/co-occurrence/' + corpus + '/' + context + '/' + m + '.cent'
    print '    retrieving', path
    return data.pickle_from_file(path)
Example #25
def retrieve_corpus_network(corpus, context):
    path = 'output/giants/co-occurrence/' + corpus + '/' + context + '_graph.net'
    return data.pickle_from_file(path)
Example #26
def retrieve_corpus_network(corpus):
    path = 'output/giants/dependency/' + corpus + '/graph.net'
    return data.pickle_from_file(path)
Example #27
def retrieve_corpus_network(corpus):
    path = 'output/giants/dependency/'+corpus+'/graph.net'
    return data.pickle_from_file(path)
Example #28
def retrieval_comparison_graph(dataset='air',
                               graph_type='co-occurrence',
                               use_icc=False):
    """
    Experiment used for comparative evaluation of different network
    representations on retrieval.

    graph_type = 'co-occurrence' | 'dependency'

    `use_icc` determines whether to use _inverse corpus centrality_ (TC-ICC) in the vector representations.
    """
    def make_dicts(docs, icc=None):
        rep = []
        for i, doc in enumerate(docs):
            if i % 100 == 0: print '    graph', str(i) + '/' + str(len(docs))
            g = gfuns[graph_type](doc)
            d = graph_representation.graph_to_dict(g, metrics[graph_type], icc)
            rep.append(d)
        return rep

    postfix = {'co-occurrence': '_text', 'dependency': '_dependencies'}
    gfuns = {
        'co-occurrence': graph_representation.construct_cooccurrence_network,
        'dependency': graph_representation.construct_dependency_network
    }
    metrics = {
        'co-occurrence': graph.GraphMetrics.WEIGHTED_DEGREE,
        'dependency': graph.GraphMetrics.EIGENVECTOR
    }

    print '--', graph_type
    print '> Reading data..', dataset
    path = '../data/' + dataset + '/problem_descriptions' + postfix[graph_type]
    docs, labels = data.read_files(path)

    print '> Creating solution representations..'
    solutions_path = '../data/' + dataset + '/solutions_preprocessed'
    solutions_texts, labels = data.read_files(solutions_path)
    solutions_rep = freq_representation.text_to_vector(
        solutions_texts, freq_representation.FrequencyMetrics.TF_IDF)

    icc = None
    if use_icc:
        print '> Calculating ICC..'
        m = metrics[graph_type].split()[0]
        print graph_type
        if graph_type == 'co-occurrence':
            p = 'output/centralities/co-occurrence/' + dataset + '/problem_descriptions/window/' + m + '.cent'
        elif graph_type == 'dependency':
            p = 'output/centralities/dependency/' + dataset + '/problem_descriptions/' + m + '.cent'
        print '    fetching', p
        icc = data.pickle_from_file(p)
        print '    icc:', type(icc)

    print '> Creating problem description representations..'
    dicts = make_dicts(docs, icc)
    descriptions_rep = graph_representation.dicts_to_vectors(
        dicts)  #, remove_stop_words=True)

    print '> Evaluating..'
    results = evaluation.evaluate_retrieval(descriptions_rep, solutions_rep)
    print results
    s = 'retrieval comparison '
    if use_icc: s += 'USING TC-ICC'
    s += '\nrepresentation: ' + graph_type + '\nresult: ' + str(
        results) + '\n\n\n'
    data.write_to_file(s, 'output/comparison/retrieval')
    return results
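With *use_icc* enabled, the function above looks up precomputed corpus centralities under output/centralities/; a hedged sketch of that call (assuming the matching .cent pickle has been stored beforehand, e.g. by a store_centralities run):

# TC-ICC weighted retrieval on the dependency representation.
results = retrieval_comparison_graph(dataset='air', graph_type='dependency', use_icc=True)
print results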
Example #29
def retrieve_centralities(corpus, context, metric):
    m = metric.split()[0]
    path = 'output/centralities/co-occurrence/'+corpus+'/'+context+'/'+m+'.cent'
    print '    retrieving',path
    return data.pickle_from_file(path)
Example #30
def retrieve_corpus_network(corpus, context):
    path = 'output/giants/co-occurrence/'+corpus+'/'+context+'_graph.net'
    return data.pickle_from_file(path)
Example #31
def retrieve_centralities(corpus, metric):
    m = metric.split()[0]
    path = 'output/centralities/dependency/' + corpus + '/' + m + '.cent'
    print '    retrieving', path
    return data.pickle_from_file(path)
Example #32
def retrieve_centralities(corpus, metric):
    m = metric.split()[0]
    path = 'output/centralities/dependency/'+corpus+'/'+m+'.cent'
    print '    retrieving',path
    return data.pickle_from_file(path)