Example #1
    def __init__(self, loc=MOHX_LOCATION):
        self.instances, self.words = [], []

        c = 0
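        # Each row is one sentence: the fourth field holds the tokens, and the
        # last two fields give the target-word index and its metaphoricity label.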
        for line in open(loc).readlines()[1:]:
            sentence = Corpus.Sentence()
            data = line.split(",")
            sentence.id = str(c)
            c += 1
            word_data = data[3].split()

            for i in range(len(word_data)):
                met = "N"
                if i == int(data[-2]):
                    met = "tag-" + data[-1].strip()
                w = Corpus.Word(text=word_data[i],
                                met=met,
                                sentence=sentence,
                                index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)

        Corpus.add_dependencies(self.instances, MOHX_DEPS, lex_field=1)
Example #2
def do_filter(sample_url_path, corpus_path, sample_corpus_path):
    import Corpus
    name_set = set(
        map(lambda line: line.strip().split()[0].split('/')[-1],
            open(sample_url_path).readlines()))
    trec_reader = Corpus.TRECReader()
    trec_reader.open(corpus_path)
    trec_writer = Corpus.TRECWriter(sample_corpus_path)
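    # Copy across only the documents whose <title> text appears in the sampled name set.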
    doc = trec_reader.next()
    start_title_tag = '<title>'
    start_title_tag_len = len(start_title_tag)
    end_title_tag = '</title>'
    count = 0
    while doc:
        text = doc.text
        start = text.find(start_title_tag)
        end = text.find(end_title_tag)
        title = ''
        if start >= 0 and end >= 0:
            title = text[start + start_title_tag_len:end]
        if title in name_set:
            trec_writer.write(doc)
            count += 1
            if count % 1000 == 0:
                print count
        doc = trec_reader.next()
    trec_reader.close()
    trec_writer.close()
Example #3
 def __init__(self):
     self.model = NMT_Model.NMT_Model()
     self.srcVocab = Corpus.Vocabulary()
     self.trgVocab = Corpus.Vocabulary()
     self.srcVocab.loadDict(Config.srcVocabF)
     self.trgVocab.loadDict(Config.trgVocabF)
     self.trainData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                      Config.trainSrcF, Config.trainTrgF)
     self.valData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                    Config.valSrcF, Config.valTrgF)
     self.buckets = self.trainData.getBuckets()
     self.networkBucket = {}
     self.bestValCE = 999999
     self.bestBleu = 0
     self.badValCount = 0
     self.maxBadVal = 5
     self.learningRate = Config.LearningRate
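     # All placeholders are shaped [MaxLength, BatchSize]: int32 token ids plus a float32 mask per side.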
     self.inputSrc = tf.placeholder(
         tf.int32,
         shape=[Config.MaxLength, Config.BatchSize],
         name='srcInput')
     self.maskSrc = tf.placeholder(
         tf.float32,
         shape=[Config.MaxLength, Config.BatchSize],
         name='srcMask')
     self.inputTrg = tf.placeholder(
         tf.int32,
         shape=[Config.MaxLength, Config.BatchSize],
         name='trgInput')
     self.maskTrg = tf.placeholder(
         tf.float32,
         shape=[Config.MaxLength, Config.BatchSize],
         name='trgMask')
     self.optimizer = tf.train.AdamOptimizer()
     self.createBucketNetworks()
Example #4
def do_match(infobox_path, text_path, out_path):
    import Corpus
    import time

    print 'loading......'
    infobox = load_infobox(infobox_path)
    reader = Corpus.TRECReader()
    reader.open(text_path)
    writer = Corpus.TRECWriter(out_path)
    matcher = InfoBoxMatcher()
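    # For every TREC document whose <title> has an infobox entry, tag its body text and write it out.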

    t0 = time.time()
    count = 0
    doc = reader.next()
    while doc:
        text = doc.text
        lines = text.split('\n')
        newlines = lines[:3]

        title_line = lines[1]
        title_begin_index = title_line.find('>')
        title_end_index = title_line.find('<', title_begin_index + 1)
        title = ''
        if title_begin_index >= 0 and title_end_index >= 0:
            title = title_line[title_begin_index + 1:title_end_index].strip()
            if title in infobox:
                tagged_text = matcher.match(infobox[title], lines[3:])
                doc.text = '\n'.join(lines[:3]) + '\n'
                doc.text += tagged_text
                writer.write(doc)
        doc = reader.next()
        count += 1
        if count % 100 == 0:
            print count, time.time() - t0
    writer.close()
Example #5
    def __init__(self, threshhold=1):
        super().__init__()
        self.instances, self.words = [], []

        def merge_sents(sent1, sent2):
            for i in range(len(sent1.words)):
                if sent1.words[i].met == "N" and sent2.words[i].met != "N":
                    sent1.words[i].met = sent2.words[i].met
                if sent1.words[i].met != "N" and sent2.words[
                        i].met != "N" and sent1.words[i].met != sent2.words[
                            i].met:
                    if sent2.words[i].met not in sent1.words[i].met:
                        sent1.words[i].met += "-" + sent2.words[i].met

        lcc_data = etree.parse(LCC_LOCATION)
        instances = lcc_data.findall(".//LmInstance")
        metaphors = set()
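        # Keep only instances whose metaphoricity score reaches the threshold.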

        for instance in instances:
            metaphor = LCCMetaphor(instance)

            if metaphor.met_score >= threshhold:
                metaphors.add(metaphor)

        Corpus.add_dependencies(metaphors, LCC_DEPS, lex_field=0)
        Corpus.add_vn_parse(metaphors, LCC_VN)
        #Corpus.add_allen_parse(metaphors, "C:/Users/Kevin/PycharmProjects/metaphor/corpora/lcc_metaphor_dataset/lcc_allen.tagged")
        #Constructions.predict_constructions(metaphors)

        for met in metaphors:
            self.instances.append(met)
            self.words.extend(met.words)
        super().build_lexicon()
Example #6
 def __init__(self):
     self.model = RNNLM_Model.LM_Model()
     self.trainData = Corpus.MonoCorpus(Config.trgVocabF, Config.trainTrgF)
     self.valData = Corpus.MonoCorpus(Config.trgVocabF, Config.valTrgF)
     self.networkBucket = {}
     self.exampleNetwork = self.getNetwork(Config.BucketGap)
     if os.path.isfile(Config.initModelF):
         self.model.loadModel(Config.initModelF)
Example #7
def F1scores(y_true, y_pred):
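    # Precision/recall/F1 computed separately for code-switched ("CS") tweets and
    # monolingual ("ES"/"EUS") tweets, using Corpus.globalTag to map the labels.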
    cs_positive_correct = 0.0
    cs_positive_total = 0.0
    cs_positive_true = 0.0

    mono_positive_correct = 0.0
    mono_positive_total = 0.0
    mono_positive_true = 0.0

    for i in range(0, len(y_pred)):
        tag_true = Corpus.globalTag(y_true[i])
        tag_pred = Corpus.globalTag(y_pred[i])

        if tag_pred == "CS":
            cs_positive_total += 1.0
        elif tag_pred == "ES" or tag_pred == "EUS":
            mono_positive_total += 1.0

        if tag_pred == "CS" and tag_true == "CS":
            cs_positive_correct += 1.0
        elif tag_pred == "ES" and tag_true == "ES":
            mono_positive_correct += 1.0
        elif tag_pred == "EUS" and tag_true == "EUS":
            mono_positive_correct += 1.0

        if tag_true == "CS":
            cs_positive_true += 1.0
        elif tag_true == "ES" or tag_true == "EUS":
            mono_positive_true += 1.0

    cs_precision = cs_positive_correct / cs_positive_total if cs_positive_total else 0.0
    cs_recall = cs_positive_correct / cs_positive_true if cs_positive_true else 0.0
    if cs_precision + cs_recall > 0:
        cs_f1 = float(2.0 * ((cs_precision * cs_recall) /
                             (cs_precision + cs_recall)))
    else:
        cs_f1 = 0.0

    mono_precision = mono_positive_correct / mono_positive_total if mono_positive_total else 0.0
    mono_recall = mono_positive_correct / mono_positive_true if mono_positive_true else 0.0
    if mono_precision + mono_recall > 0:
        mono_f1 = float(2.0 * ((mono_precision * mono_recall) /
                               (mono_precision + mono_recall)))
    else:
        mono_f1 = 0.0

    print ""
    print "Precision for code-switched tweets: %.5f" % cs_precision
    print "Recall for code-switched tweets: %.5f" % cs_recall
    print "F1-score for code-switched tweets: %.5f" % cs_f1
    print "Code-switched tagged tweets: %d" % cs_positive_total
    print "Precision for monolingual tweets: %.5f" % mono_precision
    print "Recall for monolingual tweets: %.5f" % mono_recall
    print "F1-score for monolingual tweets: %.5f" % mono_f1
    print "Monolingual tagged tweets: %d" % mono_positive_total
Example #8
    def __init__(self):
        self.model = RnnlmModel.LM_Model()
        self.trainData = Corpus.MonoCorpus(Config.trgVocabF, Config.trainTrgF)
        self.valData = Corpus.MonoCorpus(Config.trgVocabF, Config.valTrgF)
        self.networkBucket = {}

        self.inputTrg = tf.placeholder(
            tf.int32, shape=[Config.MaxLength, Config.BatchSize], name='input')
        self.maskTrg = tf.placeholder(
            tf.float32,
            shape=[Config.MaxLength, Config.BatchSize],
            name='inputMask')
        self.optimizer = tf.train.AdamOptimizer()
        self.createBucketNetworks(Config.MaxLength)
Example #9
def trainCRF(corpus_file_name):
    X_set = []
    Y_set = []
    global options

    #Read the corpus
    CS_Corpus = open(corpus_file_name, 'rb')
    CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"')
    CS_Reader.next()  #Skip first line

    lines = 0
    for row in CS_Reader:
        (X_set_part,
         Y_set_part) = TrainTweetToCRF(tweet=Corpus.getTweetTokensTags(row),
                                       token_prev_next=token_prev_next,
                                       options=options,
                                       y_set=True)
        if X_set_part and Y_set_part:
            X_set.extend(X_set_part)
            Y_set.extend(Y_set_part)
        lines += 1

    CS_Corpus.close()
    print "Tweets read: %d" % lines
    print "Train amount: %d" % lines

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               c1=0.1,
                               c2=0.1,
                               max_iterations=100,
                               all_possible_transitions=True)
    crf.fit(X_set, Y_set)  #Train CRF

    return crf
Example #10
 def __init__(self):
     self.corpus = Corpus()
     self.featurematrix = []
     self.doctermmatrix = []
     
     self.classlabels = []
     self.spacename = ""
Example #11
def do_convert_mallet(match_path, mallet_path, tag_path, num):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(match_path)
    doc = reader.next()
    converter_type = 'token'
    converter = get_converter(converter_type)
    converter.open(mallet_path)
    tag_set = set(map(lambda s: s.strip(), open(tag_path).readlines()))
    num = int(num)

    doc_count = 0
    t0 = time.time()
    total_count = 0
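    # Convert tagged documents to Mallet input until `num` documents have been processed.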
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string(doc.text)
        convert_mallet(tagged_text, converter, tag_set)
        doc = reader.next()
        doc_count += 1
        if doc_count % 10 == 0:
            print doc_count, time.time() - t0
        if doc_count > num:
            break
    converter.close()
    reader.close()
Example #12
def do_count_length(in_trec, out_path):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    count = 1
    entry_per_file = 10000
    json_list = []
    start_time = time.time()
    with codecs.open(out_path, encoding='utf8', mode='w') as writer:
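        # Write one "title<TAB>raw-text-length" line per article, skipping redirects.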
        while doc:
            length = len(doc.text)
            if '#redirect' in doc.text.lower():
                doc = reader.next()
                continue
            plain = Wiki2Plain(get_main_section(doc.text))
            text = plain.text

            body_start_pos = text.find('\n')
            if body_start_pos > 0:
                title = text[:body_start_pos]
                writer.write(u'%s\t%d\n' % (title, length))
                writer.flush()

            doc = reader.next()
    reader.close()
Example #13
def do_batch(in_trec, out_dir):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    doc = reader.next()
    count = 1
    entry_per_file = 10000
    json_list = []
    start_time = time.time()
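    # Strip wiki markup and dump articles as JSON, 10000 entries per output file.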
    while doc:
        plain = Wiki2Plain(get_main_section(doc.text))
        text = plain.text

        body_start_pos = text.find('\n')
        if body_start_pos > 0:
            title = text[:body_start_pos]
            body = text[body_start_pos:] 
            if not title.count(':') or not re.match(invalid_title_pattern, title.split(':')[0]):
                json_list.append({'id': str(count), 'title': title.strip(), 'body': body.strip()})
                if count % entry_per_file == 0:
                    out_path = os.path.join(out_dir, str(count / entry_per_file) + '.json')
                    print('writing', out_path)
                    with codecs.open(out_path, encoding='utf-8', mode='w') as writer:
                       json.dump(json_list, writer, indent=2, ensure_ascii=False) 
                       json_list = []
                    print(count, title, time.time() - start_time)
                count += 1
        doc = reader.next()
    reader.close()
Example #14
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    get_classpath(lib_dir)
    check_java_compile(lib_dir)
    pattern_set = set(
        map(lambda line: line.split()[0],
            open(pattern_path).readlines()))
    base_tag_trec_path = '%s.basetag' % trec_path
    command = [
        'java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
        stanford_tag_program, '--batch-trec', trec_path, base_tag_trec_path
    ]
    print ' '.join(command)
    subprocess.call(command)
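    # The Java pass writes a base-tagged copy of the TREC file, which is then re-read and tagged per document.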

    t = time.time()
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    doc = reader.next()
    indecies = [0]
    ids = []
    all_tagged_text = None
    while doc:
        tagged_text = TaggedText()
        tagged_text.get_from_string('\n'.join(
            filter(lambda line: not line.startswith('<'),
                   doc.text.split('\n'))))
        if all_tagged_text:
            all_tagged_text += tagged_text
        else:
            all_tagged_text = tagged_text
        indecies.append(len(all_tagged_text))
        tagged_text = apply_tag(trec_path, tagged_text, model_dir, pattern_set)
        ids.append(doc.ID)
        doc = reader.next()
    reader.close()
    os.remove(base_tag_trec_path)

    #tagged_text = apply_tag(trec_path, all_tagged_text, model_dir, pattern_set)
    print len(tagged_text)
    writer = Corpus.TRECWriter(out_path)
    for i in xrange(len(ids)):
        doc = Corpus.Document(
            ids[i], tagged_text[indecies[i]:indecies[i + 1]].__str__())
        writer.write(doc)
    writer.close()
    global prune_t, label_t
    print time.time() - t, prune_t, label_t
Example #15
def score_authors(author_list, abstract):
    """
	Scores a list of authors against a given abstract
	:param author_list: A list of authors populated with papers
	:param abstract: Abstract to be scored against
	:return:
	"""
    # create corpus from query words
    docs = {}
    cachedStopWords = stopwords.words("english")
    query = TextBlob(abstract.lower())
    docs[-1] = query
    corpWords = []
    for word in query.words:
        if word not in cachedStopWords and word not in corpWords:
            corpWords.append(word)
    # construct tf-idf vectors from documents
    maxCitations = 0
    for author in author_list:
        for paper in author.papers:
            if paper.citations > maxCitations:
                maxCitations = paper.citations
            if paper.id not in docs.keys():
                docs[paper.id] = TextBlob(paper.desc.lower())
    corpus = Corpus(docs, corpWords)
    corpus.constructVectors()

    # cosine similarity
    query = corpus.scoredDocs[0].vector

    # original doc has id of -1
    for doc in corpus.scoredDocs:
        if doc.id == -1:
            query = doc.vector
    docDict = {}
    for document in corpus.scoredDocs:
        sim = cosine_sim(query, document.vector)
        document.addScore(sim)
        docDict[document.id] = sim

    for author in author_list:
        author.setCosineSimilarity(docDict)
        author.scorePapers(maxCitations)
        author.papers.sort(key=lambda paper: paper.finalScore, reverse=True)
        author.scoreAuthor()
Example #16
    def __init__(self):
        super().__init__()
        self.instances, self.words = [], []
        lemmatizer = WordNetLemmatizer()
        cur_verb, cluster = "", ""
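        # TroFi format: "***verb" lines start a new target verb, "*literal"/"*nonliteral"
        # lines switch the cluster, and tab-separated lines hold id, label and sentence.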

        for line in open(TROFI_LOCATION).readlines():
            if re.match(r"\*\*\*[a-z]", line):
                cur_verb = line.split("***")[1]
                continue
            elif "*" in line or not line.strip():
                if "literal" in line:
                    cluster = "literal"
                elif "nonliteral" in line:
                    cluster = "nonliteral"
                continue

            sentence = Corpus.Sentence()
            data = line.strip().split("\t")
            sentence.id = data[0]

            met = ""
            if "N" in data[1]:
                met = "met"
            if "L" in data[1]:
                met = "N"
            if "U" in data[1]:
                met = "?"

            for i in range(len(data[2].split())):
                word = data[2].split()[i]
                v_lem = lemmatizer.lemmatize(word, "v")
                cur_met = "N"
                if v_lem == cur_verb:
                    cur_met = "tag-" + met
                w = Corpus.Word(text=word,
                                met=cur_met,
                                sentence=sentence,
                                index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)

        Corpus.add_dependencies(self.instances, TROFI_DEPS, lex_field=1)
Example #17
 def process(self, title, text):
     import Corpus
     self.count += 1
     title = title.replace(' ', '_').encode('utf8')
     text = text.encode('utf8')
     if title in self.name_set:
         self.writer.write(Corpus.Document(str(self.id), '<title>%s</title>\n%s' % (title, text)))
         print self.count, self.id, title
         self.id += 1
Example #18
 def teste1N(self, diretorioSusp, nomeSusp):
     '''
     Tests a suspect document against every source document in this class's directory
     '''
     corp = c.Corpus(self.diretorio)
     corp.carregarDiretorio()
     doc = self.buscarArquivo(diretorioSusp, nomeSusp)
     docsBasePlagio = corp.verificaPlagio(doc, 0.01)
     return self.salvarLogSaida(docsBasePlagio, nomeSusp)
Example #19
 def teste11(self, diretorioSusp, nomeSusp, nomeFonte):
     '''
     Tests a suspect document against a single source document whose given name is in this class's directory
     '''
     corp = c.Corpus(self.diretorio)
     docFonte = corp.carregarDoc(self.diretorio + nomeFonte, nomeSusp)
     corp.lDocumentos.anexar(docFonte)
     doc = self.carregarDoc(diretorioSusp + nomeSusp, nomeSusp)
     docsBasePlagio = corp.verificaPlagio(doc, 0.01)
     return self.salvarLogSaida(docsBasePlagio, nomeSusp)
Example #20
def do_batch(in_trec, out_trec):
    import Corpus
    reader = Corpus.TRECReader()
    reader.open(in_trec)
    writer = Corpus.TRECWriter(out_trec)
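    # Convert wiki markup to plain text and wrap the first line in <title> tags.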
    doc = reader.next()
    count = 1
    while doc:
        plain = Wiki2Plain(doc.text)
        text = plain.text
        pos = text.find('\n')
        if pos > 0:
            text = '<title>%s</title>%s' % (text[:pos], text[pos:])
        doc.text = text
        writer.write(doc)
        doc = reader.next()
        if count % 1000 == 0:
            print count
        count += 1
    reader.close()
    writer.close()
Example #21
    def __init__(self, corpus_location):
        self.instances, self.words = [], []
        data = csv.reader(open(corpus_location))
        next(data)
        for line in data:
            sentence = Corpus.Sentence()
            sentence.id = line[1]

            index = int(line[-2])
            tag = int(line[-1])

            sent_data = line[3].split()
            for i in range(len(sent_data)):
                word = sent_data[i]
                met = "N"
                if i == index:
                    met = "met"
                w = Corpus.Word(text=word, sentence=sentence, met=met, index=i)
                sentence.words.append(w)
                self.words.append(w)

            self.instances.append(sentence)
Example #22
 def __init__(self):
     self.model = NMT_Model.NMT_Model()
     self.srcVocab = Corpus.Vocabulary()
     self.trgVocab = Corpus.Vocabulary()
     self.srcVocab.loadDict(Config.srcVocabF)
     self.trgVocab.loadDict(Config.trgVocabF)
     self.trainData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                      Config.trainSrcF, Config.trainTrgF)
     self.valData = Corpus.BiCorpus(self.srcVocab, self.trgVocab,
                                    Config.valSrcF, Config.valTrgF)
     self.valBleuData = Corpus.ValCorpus(self.srcVocab, self.trgVocab,
                                         Config.valFile, Config.refCount)
     self.decoder = NMT_Decoder.NMT_Decoder(self.model, self.srcVocab,
                                            self.trgVocab)
     self.networkBucket = {}
     self.exampleNetwork = self.getNetwork(1, 1)
     self.bestValCE = 999999
     self.bestBleu = 0
     self.badValCount = 0
     self.maxBadVal = 5
     self.learningRate = Config.LearningRate
     if os.path.isfile(Config.initModelF):
         self.model.loadModel(Config.initModelF)
Example #23
    def __init__(self, lcc_instance_node):
        super().__init__()
        self.target_cm = [lcc_instance_node.get('targetConcept')]
        annotations_element = lcc_instance_node.find(".//Annotations")

        met_anns = annotations_element.find(".//MetaphoricityAnnotations")
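        # Average the annotators' metaphoricity scores for this instance.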
        self.met_score = sum([float(m.get('score'))
                              for m in met_anns]) / len(met_anns)

        cm_source_anns = annotations_element.find(".//CMSourceAnnotations")
        self.source_cm = []
        if cm_source_anns is not None:
            self.source_cm = set([(cm.get("sourceConcept"),
                                   float(cm.get("score")))
                                  for cm in cm_source_anns
                                  if float(cm.get('score')) >= 0])

        self.chain = lcc_instance_node.get('chain')
        self.id = lcc_instance_node.get('id')

        all_text = lcc_instance_node.find(".//TextContent")
        self.current_text = all_text.find(".//Current")
        self.prev_text = all_text.find(".//Prev")
        self.next_text = all_text.find(".//Next")

        self.source_lm = self.current_text.find(".//LmSource").text.strip()
        self.target_lm = self.current_text.find(".//LmTarget").text.strip()

        i = 0
        all_words = []
        for word_group in self.current_text.itertext():
            if word_group.strip() == self.source_lm:
                met = ["source", self.source_cm, self.met_score]
            elif word_group.strip() == self.target_lm:
                met = ["target", self.target_cm, self.met_score]
            else:
                met = ["N", "", ""]

            for w in [
                    w for w in re.findall(r"[\w']+|[.,?!;:\"']", word_group)
                    if w != "="
            ]:
                self.words.append(
                    Corpus.Word(text=w, met=met, index=i, sentence=self))
                i += 1
Example #24
def re_gen(dataset, type, id):
    corpus = Corpus(config['CORPUS'][dataset], dataset)
    tmp_dir = f'./tmp/{dataset}/{type}/{id}'
    create_dir(tmp_dir)

    def get_random_corpus_file(type):
        original_file_path = random.sample(
            glob.glob(
                os.path.join(get_repo_dir(dataset),
                             f'./{type}/*/*-orig.java')), 1)[0]
        original_file_name = original_file_path.split('/')[-1].split(
            '-orig')[0] + '.java'
        tmp_original_path = os.path.join(tmp_dir, original_file_name)
        shutil.copy(original_file_path, tmp_original_path)
        return (original_file_name, '', tmp_original_path)

    gen_errored(corpus, get_random_corpus_file, dataset, type, id,
                get_repo_dir(dataset))
Example #25
def do_stat(match_path):
    import Corpus
    counts = {}
    conflicts = set()
    reader = Corpus.TRECReader()
    reader.open(match_path)
    doc = reader.next()
    doc_count = 0
    t0 = time.time()
    total_count = 0
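    # Count occurrences of each "wiki:" tag and collect the sets of conflicting tags
    # assigned to a single token.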
    while doc:
        for token in doc.text.split():
            pos = token.find('/')
            if pos > 0:
                tag_string = token[pos + 1:]
                if tag_string.startswith('[') and tag_string.endswith(']'):
                    conflict_set = set()
                    for tag_token in tag_string[1:-1].split(','):
                        if tag_token.startswith('wiki:'):
                            conflict_set.add(tag_token)
                            total_count += 1
                            if tag_token in counts:
                                counts[tag_token] += 1
                            else:
                                counts[tag_token] = 1
                    if len(conflict_set) > 1:
                        conflicts.add(' '.join(list(conflict_set)))
        doc = reader.next()
        doc_count += 1
        if doc_count % 1000 == 0:
            print doc_count, time.time() - t0, total_count, len(counts), len(
                conflicts)
    count_array = map(lambda tag_count: (tag_count[1], tag_count[0]),
                      counts.items())
    count_array.sort(reverse=True)
    for count, tag in count_array:
        print count, tag
    for conflict in conflicts:
        print conflict
Example #26
from Corpus import *

if __name__ == '__main__':

    #Read corpus
    #corp_path = raw_input("Please input the path of the corpus:\n")
    #train_file = raw_input("Please input the filename of the training data:\n")
    #gold_file = raw_input("Please input the filename of the gold label:\n")
    train_file = "trail.csv"
    gold_file = "trial.labels"
    corpus = Corpus(train_file)
    #corpus.readCourpus()

    #training part.
    '''..to be complete
	
	'''

    predict_file = "trial.predict"

    #Evaluation
    if (corpus.gold_file != gold_file):
        corpus.readGold(gold_file)
    if (corpus.predict_file != predict_file):
        corpus.readPrediction(predict_file)
    corpus.evaluation()
    corpus.print_result()
Example #27
    def tag(self, corpus):
        """scoring of each sentence of the corpus"""

        res = Corpus()
        res.sentences = [self.viterbi(s) for s in corpus.sentences]
        return res
Example #28
def analys(corpus_name):

    corpus = Corpus(corpus_path[corpus_name], corpus_name)
    corpus.get_data()

    path = "plot/"
    X, Y = corpus.data
    print("size", len(Y))

    circle_plot(Histograme(corpus),
                path + "/" + corpus_name + "/",
                title=corpus_name + " : distribution of relationships")

    st = get_stop_words('en')
    st.extend(string.punctuation)
    st.extend([str(i) for i in range(10)])

    def rm_stop_words(dic):
        for i in st:
            if i in dic:
                dic[i] = 0
        return dic

    vocab, vocab_rel = get_vocab(corpus)
    vocab[''] = 0
    vocab = rm_stop_words(vocab)
    H = pd.DataFrame.from_dict(vocab, orient='index').nlargest(20,
                                                               0).to_dict()[0]
    histo(H, path + "/" + corpus_name + "/", title=corpus_name + " Histo")

    for i in get_rel_class(corpus):
        vocab = vocab_rel[i]
        vocab[''] = 0
        vocab = rm_stop_words(vocab)
        for k in H:
            if k in vocab:
                vocab[k] = 0
        Hi = pd.DataFrame.from_dict(vocab,
                                    orient='index').nlargest(20,
                                                             0).to_dict()[0]
        histo(Hi,
              path + "/" + corpus_name + "/",
              title=corpus_name + " relation " + i + " Histo")

    dist = Dist(corpus)
    box(dist, path + "/" + corpus_name + "/", title=corpus_name + " distances")

    dist = Dist(corpus)
    mean_frame, std = [], []
    for rel in dist.keys():
        df = pd.DataFrame.from_dict({rel: dist[rel]})
        mean_frame.append(df.mean())
        std.append(df.std())

    mean = pd.DataFrame(pd.concat(mean_frame), columns=["mean"])
    std = pd.DataFrame(pd.concat(std), columns=["std"])
    res = pd.concat((mean, std), axis=1)

    data = {'sentence length': [], 'Vocab': [], 'tokenisation length': []}
    tokenizer_bert, _ = get_bert()
    tokenizer_scibert, _ = get_bert(bert_type='scibert')
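    # Compare whitespace sentence length with BERT and SciBERT subword tokenization lengths.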

    for x in X:
        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('BERT VOCAB')
        data['tokenisation length'].append(len(tokenizer_bert.tokenize(x[0])))

        data['sentence length'].append(len(x[0].split(' ')))
        data['Vocab'].append('SciBERT VOCAB')
        data['tokenisation length'].append(
            len(tokenizer_scibert.tokenize(x[0])))

    data = pd.DataFrame(data)
    data = data.sort_values(by=['sentence length'])
    print(data)

    title = corpus_name + " tokenisation analysis"

    plt.rcParams["figure.figsize"] = (9, 9)

    pylab.mpl.style.use('seaborn')

    g = sns.relplot(x="sentence length",
                    y="tokenisation length",
                    hue="Vocab",
                    style="Vocab",
                    hue_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    kind="line",
                    data=data,
                    col_order=['SciBERT VOCAB', 'BERT VOCAB'],
                    style_order=['SciBERT VOCAB', 'BERT VOCAB'])
    sns.despine()
    plt.title(title)

    plt.show()
    plt.savefig(title + ".png")
Example #29
def train():
    print "train"
    start_time = time.time()
    config = SiameseTCNNConfig()
    corpus = Corpus(train_file, vocab_file, 0.0, config.seq_length,
                    config.vocab_size)
    testcorpus = Corpus(test_file, vocab_file, 1.0, config.seq_length,
                        config.vocab_size)
    print(corpus)
    print(testcorpus)

    config.vocab_size = len(corpus.words)

    train_data = TensorDataset(torch.LongTensor(corpus.x_train1),
                               torch.LongTensor(corpus.x_train2),
                               torch.FloatTensor(corpus.y_train))
    test_data = TensorDataset(torch.LongTensor(testcorpus.x_test1),
                              torch.LongTensor(testcorpus.x_test2),
                              torch.FloatTensor(testcorpus.y_test))

    print('Configuring CNN model...')
    model = SiameseTextCNN(config)
    print(model)

    # optimizer and loss function
    # criterion = nn.CrossEntropyLoss(size_average=False)
    # criterion = torch.nn.BCELoss(reduce=False, size_average=False)
    if config.contra_loss:
        criterion = ContrastiveLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    # set the mode to train
    print("Training and evaluating...")
    best_F1 = 0.0
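    # Training loop: compute the pair loss for each batch and keep the checkpoint with the best test F1.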

    for epoch in range(config.num_epochs):
        # load the training data in batch
        model.train()
        train_loader = DataLoader(train_data, batch_size=config.batch_size)
        ii = 0
        for x1_batch, x2_batch, y_batch in train_loader:
            ii += 1
            if ii % 100 == 0: print epoch, "batch", ii
            inputs1, inputs2, targets = Variable(x1_batch), Variable(
                x2_batch), Variable(y_batch)

            optimizer.zero_grad()
            outputs1, outputs2 = model(inputs1, inputs2)  # forward computation

            loss = criterion(outputs1, outputs2, targets)
            """
            todo
            """
            # backward propagation and update parameters
            loss.backward()
            optimizer.step()

        # evaluate on both training and test dataset

        print "epoch", epoch
        train_loss, train_F1 = evaluate(train_data, model, criterion)
        test_loss, test_F1 = evaluate(test_data, model, criterion)
        #print "train_loss:",train_loss

        if test_F1 > best_F1:
            # store the best result
            best_F1 = test_F1
            improved_str = '*'
            torch.save(model.state_dict(), model_file)
        else:
            improved_str = ''

        time_dif = get_time_dif(start_time)
        msg = "Epoch {0:3}, Train_loss: {1:>7.3}, Train_F1 {2:>6.3%}, " \
              + "Test_loss: {3:>6.3}, Test_F1 {4:>6.3%}, Time: {5} {6}"
        print(
            msg.format(epoch + 1, train_loss, train_F1, test_loss, test_F1,
                       time_dif, improved_str))
Example #30
        except:
            author = i['author']['name']
        txt = i['title'] + ". " + i['summary']
        txt = txt.replace('\n', ' ')
        txt = txt.replace('\r', ' ')
        try:
            coAuth = [aut['name'] for aut in i['author']][1:]
        except:
            coAuth = "Pas de Co-Auteur"
        doc = Document.ArxivDocument(datet, i['title'], author, txt, i['id'],
                                     coAuth)
        corpus_Arxiv.add_doc(doc)


#Initialize the corpora
corpus_Reddit = Corpus.Corpus("Corona_red")
corpus_Arxiv = Corpus.Corpus("Corona_arx")

#Load the data into the corpora
loadArxiv(corpus_Arxiv)
loadReddit(corpus_Reddit)

#Print the number of documents and authors
print("Création du corpus Reddit, %d documents et %d auteurs" %
      (corpus_Reddit.ndoc, corpus_Reddit.naut))
print("Création du corpus Arxiv, %d documents et %d auteurs" %
      (corpus_Arxiv.ndoc, corpus_Arxiv.naut))

print()

#Save the corpora
Example #31
            del content[:]
    return all_docs

# Returns the maximum X terms of a list
def get_top_terms(dict, number_of_terms):
    sorted_tf_idf_list = sorted(dict.items(),
                            key=operator.itemgetter(1), reverse=True)
    return sorted_tf_idf_list[0:number_of_terms]


print os.listdir(".")

# Read text files from the base folder
for file in os.listdir(base_folder):
    print "Now grabbing contents from file", file
    cor = Corpus(file)
    f = codecs.open(base_folder + file, encoding="utf-8")
    cor.list_of_documents = import_file(f)
    print "Number of documents is", len(cor.list_of_documents)
    f.close()

    corpses.append(cor)

# Print stats for all corpuses
for corpse in corpses:
    corpse.print_stats()

while True:
    input = raw_input("Enter word: ")

    count = 0
Example #32
class DataSpace:
    corpus = Corpus()
    featurematrix = []
    doctermmatrix = []
   
    classlabels = []
    spacename = ""     # numoffiles_classtask
    
    def __init__(self):
        self.corpus = Corpus()
        self.featurematrix = []
        self.doctermmatrix = []
        
        self.classlabels = []
        self.spacename = ""
        
    def __getfeaturematrix(self):
        fname = IOtools.picklepath+os.sep+self.spacename+"_featurematrix.p"
        return self.featurematrix, fname
    def __getdoctermmatrix(self):
        fname = IOtools.picklepath+os.sep+self.spacename+"_doctermmatrix.p"
        return self.doctermmatrix, fname
    def __getcorpora(self):
        fname = IOtools.picklepath+os.sep+self.spacename+"_corpora.p"
        return self.corpora, fname
    
    
    def __dumpfeaturematrix(self):
        fname = IOtools.picklepath+os.sep+self.spacename+"_featurematrix.p"
        pickle.dump(self.featurematrix, open(fname, "wb"))
    def __dumpdoctermmatrix(self):
        fname = IOtools.picklepath+os.sep+self.spacename+"_doctermmatrix.p"
        pickle.dump(self.doctermmatrix, open(fname, "wb"))
    def __dumpcorpora(self):
        fname = IOtools.picklepath+os.sep+self.spacename+"_corpora.p"
        pickle.dump(self.corpora, open(fname, "wb"))
    
    
    ''' nfile is a dict storing the number of files per classlabel to be read  
        nfile default value 0 rec.'''
    def buildcorpus(self, nfile, resourcepath, classlabels, corpusname, taskname, plaintext, nostopwords):
        labelwisepathlist = {}
        
        for classlabel in classlabels:
            labelwisepathlist[classlabel] = []
            
        for classlabel in classlabels:
            p = resourcepath + os.sep + classlabel + os.sep
            fileids = []
            fileids = IOtools.getfilenames_of_dir(p, removeextension=False)[:nfile[classlabel]]
            
            labelwisepathlist[classlabel].extend(fileids)
            
            
        self.corpus.setname(corpusname)
        self.corpus.read_corpus(labelwisepathlist, plaintext, nostopwords)
        ncat = len(classlabels)
        self.spacename = taskname+"-"+str(nfile*ncat)+"texts"
        
        

    def compute_tfidf(self):
        ''' fix the matrices; save them as csv '''
    def build_featurematrix(self):
        for corpus in self.corpora:
            datapoints = corpus.build_featurematrix()
            for k,v in datapoints.iteritems():
                self.featurematrix.append([k]+v+[corpus.label])
        self.record_matrix(self.featurematrix, "featureMATRIX")
       
        
    def build_termdocmatrix(self):
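        # Build a document-term count matrix from per-corpus ConditionalFreqDists and record it with class labels.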
        cfdDocTerm = nltk.ConditionalFreqDist()
                
        #docs = []
        labelleddocs = []
        for corpus in self.corpora:
            cfd = corpus.build_termmatrix()
            label = corpus.label
            print label
            for term in cfd.conditions():
                #docs.extend(list(cfd[term]))
                #labelleddocs = [(doc, label) for doc in docs]
                #print list(cfd[term])
                for fileid in list(cfd[term]):
                    cfdDocTerm[term].inc(fileid)
                    labelleddocs.append((fileid, label))
        
        print labelleddocs           
        labelleddocs = list(set(labelleddocs))
        print labelleddocs
        CFDhelpers.recordCFD(cfdDocTerm, self.spacename+"CFDdocterm")
        
        matrix = []
        matrix.append(cfdDocTerm.conditions())
        
        
        for fileid,label in labelleddocs:
            row = []
            for term in cfdDocTerm.conditions():
                numofoccurrences = cfdDocTerm[term][fileid]
                row.append(numofoccurrences)
            self.doctermmatrix.append([fileid]+row+[label])
            matrix.append([fileid]+row+[label])
        
                       
        self.record_matrix(matrix, "DocTermMATRIXn")
        self.record_matrix(self.doctermmatrix, "DocTermMatrix")
        
        self.__dumpdoctermmatrix()
             
    
    def record_matrix(self, matrix, mname):
        fname = IOtools.matrixpath+os.sep+mname+"-"+self.spacename+"MATRIX.m"
        IOtools.todisc_matrix(matrix, fname)
Example #33
import Corpus
import numpy as np

import IBM1
import IBM2
import HMM

print("loading the corpus...")
corpus = Corpus.Corpus("eutrans/training", separator="#")
#corpus = Corpus.Corpus("corpus.txt", separator="---")
corpus.print_corpus_description()
print("...done")

#%% Testing IBM1
# print(" ")
# print("*"*50)
# print(" ")
# print("Building IBM1 item...")
# ibm1 = IBM1.IBM1(corpus)
# print("...done")
# print("starting to train IBM1...")
# ibm1_nb_training_step = 10
# imb1perplexityevol = ibm1.train(ibm1_nb_training_step, verbose=True)
# print("...done")
#
# print "\nIBM1 perplexity : ",ibm1.get_perplexity(),"\n"
#
# f2e = np.argmax(ibm1.proba_f_knowing_e,axis=1)
# print "IBM1 Translations :"
# for i in range(len(corpus.french_words)):
#     print corpus.french_words[i], " --> ", corpus.english_words[f2e[i]]
Example #34
from glove import Glove
import Corpus

corpus = Corpus()

sent_token = [["안녕", "하세요"], ["지니티토리", "입니다"]]
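# Build the word co-occurrence matrix over a symmetric context window of 20 tokens.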

corpus.fit(sent_token, window=20)

# model
glove = Glove(no_components=128, learning_rate=0.01)
glove.fit(corpus.matrix, epochs=50, no_threads=4, verbose=False)
glove.add_dictionary(corpus.dictionary)

# save
glove.save(DATA_DIR + '/glove_w20_epoch50.model')