Example #1
    def __init__(self, training_data_name="_wnut_and_hege",
                 train_files=[(TRAIN_FILE, "utf-8"), (HEGE_TRAIN_FILE, "utf-8")]):
        # Load the pickled Twitter NER model trained on this combination of datasets.
        model_dir = os.path.join(MODEL_DIR, "python%s" % sys.version_info.major)
        with open(os.path.join(model_dir, TWITTER_NER_MODEL_FILE % training_data_name), "rb") as pickle_file:
            self.model = pickle.load(pickle_file)

        # Dictionary (gazetteer) features.
        self.dict_features = DictionaryFeatures(DICTIONARY_DIR)

        # Collect tokens from the dev file plus every training file to build word vectors.
        all_sequences = load_sequences(DEV_FILE)
        for (train_file, encoding) in train_files:
            all_sequences.extend(load_sequences(train_file, sep="\t", encoding=encoding))
        all_tokens = [[t[0] for t in seq] for seq in all_sequences]

        # Convert the raw GloVe vectors once, then load vectors for the collected tokens.
        if not os.path.exists(WORDVEC_FILE_PROCESSED):
            process_glovevectors(WORDVEC_FILE_RAW)
        self.word2vec_model = WordVectors(all_tokens, WORDVEC_FILE_PROCESSED)

        # Brown clusters from the Gimple Twitter cluster directory.
        gimple_brown_cf = ClusterFeatures(GIMPLE_TWITTER_BROWN_CLUSTERS_DIR, cluster_type="brown")
        gimple_brown_cf.set_cluster_file_path(GIMPLE_TWITTER_BROWN_CLUSTERS_DIR)
        self.gimple_brown_clusters = gimple_brown_cf.read_clusters()

        # Brown clusters induced on the test-enriched data (100 clusters).
        test_enriched_data_brown_cluster_dir = TEST_ENRICHED_DATA_BROWN_CLUSTER_DIR % training_data_name
        test_enriched_data_brown_cf = ClusterFeatures(test_enriched_data_brown_cluster_dir,
                                                      cluster_type="brown", n_clusters=100)
        test_enriched_data_brown_cf.set_cluster_file_path()
        self.test_enriched_data_brown_clusters = test_enriched_data_brown_cf.read_clusters()

        # Clark clusters induced on the test-enriched data (32 clusters).
        test_enriched_data_clark_cluster_dir = TEST_ENRICHED_DATA_CLARK_CLUSTER_DIR % training_data_name
        test_enriched_data_clark_cf = ClusterFeatures(test_enriched_data_clark_cluster_dir,
                                                      cluster_type="clark", n_clusters=32)
        test_enriched_data_clark_cf.set_cluster_file_path()
        self.test_enriched_data_clark_clusters = test_enriched_data_clark_cf.read_clusters()
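
This example (and Example #4 below) relies on a load_sequences helper that is not shown on this page. As a rough illustration only, here is a minimal sketch of what such a CoNLL-style reader could look like, assuming one token per line, columns separated by sep, and a blank line between sequences; the project's actual helper may differ.

def load_sequences(filename, sep="\t", encoding="utf-8", col_ids=None):
    # Read a CoNLL-style file: one token per line, blank line between sequences.
    # Each sequence becomes a list of tuples built from the selected columns.
    sequences, current = [], []
    with open(filename, encoding=encoding) as fp:
        for line in fp:
            line = line.rstrip("\n")
            if not line:
                if current:
                    sequences.append(current)
                    current = []
                continue
            cols = line.split(sep)
            if col_ids is not None:
                cols = [cols[i] for i in col_ids]
            current.append(tuple(cols))
    if current:
        sequences.append(current)
    return sequences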
Example #2

import argparse

import utils


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='store_true',
                        help='print versions')
    parser.add_argument('sequencefile',
                        help='a FASTQ file full of sequences')
    args = parser.parse_args()

    if args.version:
        print('Using version', utils.version, 'of utils.py')

    # Record the number of Ns in each sequence.
    seqs = utils.load_sequences(args.sequencefile)

    tracking = []
    for seq in seqs:
        n_bases, n_ns = utils.count_Ns(seq)
        tracking.append(n_ns)

    # Tally how many sequences contain zero, one, or more than one N.
    num_with_zero = 0
    num_with_one = 0
    num_with_more = 0

    for count in tracking:
        if count == 0:
            num_with_zero += 1
        elif count == 1:
            num_with_one += 1
        else:
            num_with_more += 1

    # Sequences with 0 / 1 / >1 Ns, followed by the total number of Ns.
    print(num_with_zero, num_with_one, num_with_more)
    print(sum(tracking))
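
The loop above assumes utils.count_Ns returns a (total_bases, number_of_Ns) pair for one sequence string. The utils module is not shown here; a minimal sketch of such a helper, written purely as an assumption:

def count_Ns(seq):
    # Return (total number of bases, number of N/n bases) for one sequence string.
    n_bases = len(seq)
    n_ns = sum(1 for base in seq if base in "Nn")
    return n_bases, n_ns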
Example #3

import sys

import utils


def main():
    print('Loading sequences from', sys.argv[1])
    seqs = utils.load_sequences(sys.argv[1])

    # Accumulate base and N counts over all sequences.
    count = 0
    total_bases = 0
    total_ns = 0
    for seq in seqs:
        (n_bases, n_ns) = utils.count_Ns(seq)
        total_bases += n_bases
        total_ns += n_ns
        count += 1

    print("total sequences", count)
    print("fraction of bases that are Ns:", total_ns / total_bases)
Example #4
import re
import io
from pathlib import Path

import seaborn as sns

sns.set_context("poster")
sns.set_style("ticks")

# In[2]:

TRAIN_CORPUS = "data/conll2000/train.txt"
TEST_CORPUS = "data/conll2000/test.txt"

# In[3]:

# load_sequences (defined/imported elsewhere in this notebook) reads the token
# (column 0) and the chunk tag (last column) from the space-separated CoNLL-2000 files.
train_corpus = load_sequences(TRAIN_CORPUS, sep=" ", col_ids=(0, -1))
# Hold out the first 100 sentences of the training data as a development set.
train_corpus, dev_corpus = train_corpus[100:], train_corpus[:100]
print("Total items in train corpus: %s" % len(train_corpus))
print("Total items in dev corpus: %s" % len(dev_corpus))
test_corpus = load_sequences(TEST_CORPUS, sep=" ", col_ids=(0, -1))
print("Total items in test corpus: %s" % len(test_corpus))

# In[4]:

# Inspect the first training sequence (a list of (token, chunk_tag) pairs).
train_corpus[0]

# In[5]:


def create_vocab(data, vocabs, char_vocab, word_idx=0):
    n_vocabs = len(vocabs)